diff --git a/CMakeLists.txt b/CMakeLists.txt
index b309ff37e52b4fd28b14925bdd7e3740e1e2fa47..5df83499d5dde29b205ee17fba81a63c9a643235 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,8 +16,6 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
 include(system)
 
@@ -201,6 +199,10 @@ if(WITH_GOLANG)
 endif(WITH_GOLANG)
 
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
+
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
diff --git a/README.md b/README.md
index ceeb6d9e5193763293d3fce76e464340fbce533f..577528e7aaf45ce002467590ec66b19afb145920 100644
--- a/README.md
+++ b/README.md
@@ -61,32 +61,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
 
 ## Documentation
 
-We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
-[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
 
-- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
 
    You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
 
    We appreciate your contributions!
 
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 8ee7fd28c58f2a2bcb82040eb824a37062bd4e9c..6cc9598947acbdacfbf4c4379987bab8ed7611b0 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -22,6 +22,7 @@ On each machine, we will test and compare the performance of training on single
 
 #### Training
 Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet.
 
 Input image size - 3 * 224 * 224, Time: images/second
 
@@ -55,6 +56,16 @@ Input image size - 3 * 224 * 224, Time: images/second
 
 <img src="figs/googlenet-cpu-train.png" width="500">
 
+- Alexnet
+
+| BatchSize    | 64     | 128    | 256    |
+|--------------|--------| ------ | -------|
+| OpenBLAS     | 2.13   | 2.45   | 2.68   | 
+| MKLML        | 66.37  | 105.60 | 144.04 |
+| MKL-DNN      | 399.00 | 498.94 | 626.53 | 
+
+chart TBD
+
 #### Inference
 Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 - VGG-19
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
index 3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..77d130ae34059d1e87040d00346ac1dadd86b0d8 100644
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
     "train.list", None, module="provider", obj="process", args=args)
 
@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
 
 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
 
@@ -40,11 +50,11 @@ net = img_conv_layer(
     input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
 
 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
 
 net = fc_layer(
@@ -59,6 +69,9 @@ net = fc_layer(
     layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
 
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index 7059c13bd2c2b98eb3fbcf633a6f7064e54d5402..2a850ccb7f2c75b467554181fc5f4aa8f2b97a09 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 
 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 927b1759941f362ef4b5ffe84dd01332986d9306..1018ec9ce1e529f618ddd7b7afa72a84c5e876a1 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
     else:
         settings.data_size = settings.height * settings.width
     settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
     if settings.is_infer:
         settings.slots = [dense_vector(settings.data_size)]
     else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 @provider(
     init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
         if settings.is_infer:
             yield img.astype('float32')
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
index 4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f..2846e4763f1cda4602f03af5ec649d57ee6cf0d8 100644
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 
 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
similarity index 95%
rename from benchmark/paddle/image/run_mkldnn_infer.sh
rename to benchmark/paddle/image/run_mkl_infer.sh
index d795bcab1b7d098295066f79189d17e8299d28fb..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 100755
--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -37,7 +37,7 @@ function infer() {
       --trainer_count=1 \
       --num_passes=1 \
       --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
       > /dev/null 2>&1
     echo "Done"
   fi
@@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
   for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
     infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
   done
 done
diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkl_train.sh
similarity index 83%
rename from benchmark/paddle/image/run_mkldnn_train.sh
rename to benchmark/paddle/image/run_mkl_train.sh
index 320206239ae960bd088b05d3b10934a98da741b1..03d2d378fb72e36f765d89af788f6ee96fe21d4e 100755
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -28,6 +28,10 @@ function train() {
     --test_period=100 \
     --config_args=$args \
     2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
 
 if [ ! -f "train.list" ]; then
@@ -43,5 +47,6 @@ for use_mkldnn in True False; do
     train vgg 19 $batchsize $use_mkldnn
     train resnet 50 $batchsize $use_mkldnn
     train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
   done
 done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
new file mode 100755
index 0000000000000000000000000000000000000000..da034f3b9dff794e22086a5295ad2b0c2361c356
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -0,0 +1,64 @@
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  if [ $thread -gt $bs ]; then
+    thread=$bs
+  fi
+  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "./run_mkl_infer.sh to save the model first"
+    exit 0
+  fi
+  log_period=$((32 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # calculate the last 5 logs period time of 160(=32*5) samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+  infer vgg 19 $batchsize
+  infer resnet 50 $batchsize 
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e9df83fee2a3f796b7234b39619364f6ee4d5dc9
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -0,0 +1,41 @@
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  # each trainer_count use only 1 core to avoid conflict
+  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=3 \
+    --test_period=30 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+  train vgg 19 $batchsize
+  train resnet 50 $batchsize
+  train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index 8d0a1e97a451cd52ef17e4e326673cc90059ef3c..ca0a6798fb8c35b68cf84d263855955eb93ba0b0 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 
 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index fab2af362bb070a54987b6499748056f3d12a56b..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
     IF(WITH_C_API)
         INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
         IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
         ELSE()
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
         ENDIF()
     ENDIF()
 
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index 9be0b370ee5e301aee4a6e31b1cfa905754968e8..84f9097a6cdc2da269bd6a0685796e14e26da37e 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
     模型配置 <v2/model_configs.rst>
     数据访问 <v2/data.rst>
     训练与应用 <v2/run_logic.rst>
+    v2/fluid.rst
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c3f9c18d0663a7a24880b441981875c1e4f015aa..ddf0b055a92d80295b24255a5462d477e0d9c796 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -252,6 +252,11 @@ first_seq
 ..  autoclass:: paddle.v2.layer.first_seq
     :noindex:
 
+sub_seq
+---------
+..  autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
 concat
 ------
 ..  autoclass:: paddle.v2.layer.concat
@@ -467,7 +472,7 @@ lambda_cost
     :noindex:
 
 square_error_cost
---------
+-----------------
 ..  autoclass:: paddle.v2.layer.square_error_cost
     :noindex:
 
@@ -533,7 +538,7 @@ Miscs
 =====
 
 dropout
---------------
+--------
 ..  autoclass:: paddle.v2.layer.dropout
     :noindex:
 
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 89e5fec13bf9062dc7a7187b1334c8f5486a980b..004ee2d8c85ce7661886179570e693d7d61bc6d8 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -19,17 +19,17 @@ dynamic_lstm
     :noindex:
 
 data
----------
+----
 ..  autofunction:: paddle.v2.fluid.layers.data
     :noindex:
 
 mean
----------
+----
 ..  autofunction:: paddle.v2.fluid.layers.mean
     :noindex:
 
 mul
----------
+---
 ..  autofunction:: paddle.v2.fluid.layers.mul
     :noindex:
 
@@ -45,13 +45,13 @@ elementwise_div
 
 
 dropout
----------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.dropout
     :noindex:
 
 
 reshape
----------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.reshape
     :noindex:
 
@@ -68,12 +68,6 @@ scale
     :noindex:
 
 
-reshape
----------
-..  autofunction:: paddle.v2.fluid.layers.reshape
-    :noindex:
-
-
 transpose
 ---------
 ..  autofunction:: paddle.v2.fluid.layers.transpose
@@ -81,67 +75,67 @@ transpose
 
 
 sigmoid_cross_entropy_with_logits
----------
+---------------------------------
 ..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
     :noindex:
 
 
 cast
----------
+----
 ..  autofunction:: paddle.v2.fluid.layers.cast
     :noindex:
 
 
 concat
----------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.concat
     :noindex:
 
 
 sums
----------
+----
 ..  autofunction:: paddle.v2.fluid.layers.sums
     :noindex:
 
 
 linear_chain_crf
----------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
     :noindex:
 
 
 assign
----------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
     :noindex:
 
 
 split_lod_tensor
----------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
     :noindex:
 
 
 merge_lod_tensor
----------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
     :noindex:
 
 cos_sim
----------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
     :noindex:
 
 
 cross_entropy
----------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
     :noindex:
 
 
 
 square_error_cost
----------
+-----------------
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
     :noindex:
 
@@ -153,74 +147,80 @@ accuracy
 
 
 sequence_conv
----------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
     :noindex:
 
 
 conv2d
----------
+------
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
     :noindex:
 
 
 sequence_pool
----------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
     :noindex:
 
 
+sequence_first_step
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+
+sequence_last_step
+------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
+
 pool2d
----------
+------
 ..  autofunction:: paddle.v2.fluid.layers.pool2d
     :noindex:
 
 
 batch_norm
----------
+----------
 ..  autofunction:: paddle.v2.fluid.layers.batch_norm
     :noindex:
 
 
 beam_search_decode
----------
+------------------
 ..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
     :noindex:
 
 
-lstm
----------
-..  autofunction:: paddle.v2.fluid.layers.lstm
-    :noindex:
-
-
 lod_rank_table
----------
+--------------
 ..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
     :noindex:
 
 
 max_sequence_len
----------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
     :noindex:
 
 
 topk
----------
+-----
 ..  autofunction:: paddle.v2.fluid.layers.topk
     :noindex:
 
 
 lod_tensor_to_array
----------
+-------------------
 ..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
     :noindex:
 
 
 
 array_to_lod_tensor
----------
+-------------------
 ..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
     :noindex:
 
@@ -228,26 +228,26 @@ array_to_lod_tensor
 
 
 fill_constant
----------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.fill_constant
     :noindex:
 
 
 
 fill_constant_batch_size_like
----------
+-----------------------------
 ..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
     :noindex:
 
 
 ones
----------
+----
 ..  autofunction:: paddle.v2.fluid.layers.ones
     :noindex:
 
 
 zeros
----------
+-----
 ..  autofunction:: paddle.v2.fluid.layers.zeros
     :noindex:
 
@@ -259,14 +259,14 @@ increment
 
 
 array_write
----------
+-----------
 ..  autofunction:: paddle.v2.fluid.layers.array_write
     :noindex:
 
 
 
 create_array
----------
+------------
 ..  autofunction:: paddle.v2.fluid.layers.create_array
     :noindex:
 
@@ -278,25 +278,67 @@ less_than
 
 
 array_read
----------
+----------
 ..  autofunction:: paddle.v2.fluid.layers.array_read
     :noindex:
 
 
 shrink_memory
----------
+--------------
 ..  autofunction:: paddle.v2.fluid.layers.shrink_memory
     :noindex:
 
 
 array_length
----------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.array_length
     :noindex:
 
 
 conv2d_transpose
----------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
     :noindex:
 
+
+sequence_expand
+---------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+
+sequence_softmax
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+    :noindex:
+
+
+reduce_sum
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+
+reduce_mean
+-----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+    :noindex:
+
+
+reduce_max
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
+
+
+reduce_min
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index 2c3d075422de29c96e25458e831133a30270dd39..b792efb71f85ae643df655568da69c82414e9d5d 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -3,19 +3,19 @@ Nets
 ===========
 
 simple_img_conv_pool
------------
+--------------------
 ..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
     :noindex:
 
 
 img_conv_group
------------
+---------------
 ..  autofunction:: paddle.v2.fluid.nets.img_conv_group
     :noindex:
 
 
 sequence_conv_pool
------------
+------------------
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
     :noindex:
 
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
index 233762fcdfb39e592740adef6721a556fae3feef..19b4940f08de3e2f7dc177f2961e538946d10a78 100644
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -18,7 +18,7 @@ SGDOptimizer
 
 
 MomentumOptimizer
------------
+-----------------
 ..  automodule:: paddle.v2.fluid.optimizer
     :members: MomentumOptimizer
     :noindex:
@@ -26,14 +26,14 @@ MomentumOptimizer
 
 
 AdagradOptimizer
------------
+----------------
 ..  automodule:: paddle.v2.fluid.optimizer
     :members: AdagradOptimizer
     :noindex:
 
 
 AdamOptimizer
------------
+-------------
 ..  automodule:: paddle.v2.fluid.optimizer
     :members: AdamOptimizer
     :noindex:
@@ -47,7 +47,7 @@ AdamaxOptimizer
 
 
 DecayedAdagradOptimizer
------------
+-----------------------
 ..  automodule:: paddle.v2.fluid.optimizer
     :members: DecayedAdagradOptimizer
     :noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
index 3af2b07d2ae55d99df705fbf1ad2402eee05c435..868e225ed3d59e79aeb217fb88081ea25f80fa2c 100644
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -3,14 +3,14 @@ Regularizer
 ===========
 
 WeightDecayRegularizer
------------
+----------------------
 ..  automodule:: paddle.v2.fluid.regularizer
     :members: WeightDecayRegularizer
     :noindex:
 
 
 L2DecayRegularizer
------------
+------------------
 ..  automodule:: paddle.v2.fluid.regularizer
     :members: L2DecayRegularizer
     :noindex:
@@ -18,7 +18,7 @@ L2DecayRegularizer
 
 
 L1DecayRegularizer
------------
+-------------------
 ..  automodule:: paddle.v2.fluid.regularizer
     :members: L1DecayRegularizer
 
diff --git a/doc/design/block.md b/doc/design/block.md
index 4066122c0e8dfa33776796c3d205ba5aec9e0f52..fab7f2dc481ae51aa982164dc5048d90fcdc2b0b 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -291,10 +291,10 @@ public:
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
     PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
     for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
     }
   }
 
diff --git a/doc/design/executor.md b/doc/design/executor.md
index b5fb6c5c3c1da3c112ce63878322083dd5c42b70..2d4b371cc56db82ce5747da6db07f05aa7f7e6c1 100644
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
@@ -1,23 +1,29 @@
 # Executor Design Doc
 
 ## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 
-We use executor to do the runtime evaluation of a `ProgramDesc`.
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.
 
 ## Overview
 
-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
 
-### What does executor do?
+## Executor
 
-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process.
 
-### What does executor NOT do?
+### The interface
+```c++
+  Executor(places);
+```
+A executor does not own any computing resources, a user can only construct an executor using the specified places.
 
-It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
+### Running an Executor
 
-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..a54b7da045e1a362626ef066f9ebb56af2c3181a
--- /dev/null
+++ b/doc/design/kernel_hint_design.md
@@ -0,0 +1,57 @@
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+
+So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel.
+
+2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=false):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkl/image/engine.png
similarity index 100%
rename from doc/design/mkldnn/image/engine.png
rename to doc/design/mkl/image/engine.png
diff --git a/doc/design/mkldnn/image/gradients.png b/doc/design/mkl/image/gradients.png
similarity index 100%
rename from doc/design/mkldnn/image/gradients.png
rename to doc/design/mkl/image/gradients.png
diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkl/image/layers.png
similarity index 100%
rename from doc/design/mkldnn/image/layers.png
rename to doc/design/mkl/image/layers.png
diff --git a/doc/design/mkldnn/image/matrix.png b/doc/design/mkl/image/matrix.png
similarity index 100%
rename from doc/design/mkldnn/image/matrix.png
rename to doc/design/mkl/image/matrix.png
diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkl/image/overview.png
similarity index 100%
rename from doc/design/mkldnn/image/overview.png
rename to doc/design/mkl/image/overview.png
diff --git a/doc/design/mkl/mkl_packed.md b/doc/design/mkl/mkl_packed.md
new file mode 100644
index 0000000000000000000000000000000000000000..0123315ad4368e68b377f66119949bfd6c1c7860
--- /dev/null
+++ b/doc/design/mkl/mkl_packed.md
@@ -0,0 +1,108 @@
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+	- [Layers](#layers)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+
+
+## Overview
+我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中，充分发挥英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+现阶段的优化主要针对 Recurrent Neural Network（以下简称RNN）相关层（包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`）， 以及 PaddlePaddle V1 API。
+
+## Key Points
+
+### Background
+目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数，这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
+
+1. 转换耗时 \
+这一数据格式的转换操作（Packing），在问题本身的计算量比较小的时候，显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中，矩阵大小是`batch_size * 2048`。
+2. 转换冗余 \
+由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。
+
+为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。
+
+### Solution
+在RNN的情况下，同一次前向、后向（forward/backward）过程中所有时间步（time step）共享同一个权重（weight）。当只做推断（inference）时，各次前向之间也都使用了相同的权重，没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
+
+我们通过使用新引入的GEMM Packed APIs，在层初始化的时候，先完成对权重的Packing操作，然后在前向，后向时复用已经转换过的权重，并在每次权重更新后，对新的权重进行转换用于下次迭代。
+
+* 优化前，对于序列长度（sequence length）为`T`的网络模型（model）, `N`次迭代执行的转换次数为：
+  - `inference`： `N * T`  
+  - `training`： `2 * N * T`
+* 优化后，对于同样设置的网络模型，其转换次数减少至：
+  - `inference`： `1`    
+  - `training`： `2 * N`
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        |   ├── MKLPackedGatedRecurrentLayer.*
+        |   ├── MKLPackedLstmLayer.*
+        |   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开，来决定是否开启MKL Packed相关功能。
+
+### Layers
+所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`，该文件对相关GEMM Packed APIs做了封装。
+
+### Unit Tests
+我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
+对于每一个新加的RNN layer，我们会对比如下2个方面：
+1. 对比优化后layer自身，sequence mode（`rnn_use_batch=false`）与batch mode(`rnn_use_batch=true`)的结果。
+2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
+
+### Python API
+计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag，用于选择是否使用相关功能，并且当编译时`WITH_MKL=ON`的情况下，默认设置为`true`。
+
+同时，在`python/paddle/trainer/config_parser.py`中对应的layer处，添加`use_mkl_packed`这个选择，方便用户在Python端选择是否启用这个功能。
+
+具体实现方式比如：
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+所有相关的`layer_type`会以*mkl_packed_*开头，这些会在`MKLPacked*Layer`注册layer的时候保证，以示区分。 
+
+
+### Benchmarking
+会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
+
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkl/mkldnn.md
similarity index 99%
rename from doc/design/mkldnn/README.MD
rename to doc/design/mkl/mkldnn.md
index 61d453de243c25defc56161641bc4a888a88a3b7..e2fe1e6b26ffa73fda81863abfadf697c0acbfcf 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkl/mkldnn.md
@@ -208,4 +208,3 @@ if use_mkldnn
 但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
 4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
 所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
-
diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..bef126f3f0577b69f646dfe5d10539b372c6a8a5
--- /dev/null
+++ b/doc/design/mkl/mkldnn_fluid.md
@@ -0,0 +1,149 @@
+# Design Doc: Add MKLDNN Kernel in Fluid Operator
+
+## Principles
+
+First of all, we should follow some basical principles like:
+1.  [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc.
+2.  [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).
+3.  [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels.
+4.  [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`.
+
+## Sulution
+
+In general, there are four parts we should follow to run a MKL-DNN primitive.
+-  Create a primitive descriptor that describe this operator
+-  Create a primitive itself by primitive descriptor and the engine
+-  Create all memory buffers that primitive needed
+-  Launch a stream to execute the primitive created
+More details can refer to [here](http://01org.github.io/mkl-dnn).
+
+It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. \
+So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).
+
+It's assumed that following three conditions should be satisfied.
+1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`.
+2. the `Input Tensor` inside `Compute` function is the one after converted.
+3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user.
+
+### Compute
+The algorithm of `Compute` would be described as follow, let's take conv like an example.
+
+```c++
+
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
+  PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");
+
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+
+  // find primitive by unique key from mkldnn context
+  // the op_key should be a unique name of this op instance
+  auto& p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  // assuming the input tensor inside this compute function is the one after converted
+  // this point should be guarantee by another mechanism
+  auto& i = dev_ctx.findMemory(op_key + "_input");
+  
+  if (p == nullptr || i == nullptr || inputSizeChanged(p, i))  {
+    auto fwd_primitive_desc = createPrimitiveDesc(ctx);
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+    shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
+    shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
+    shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
+    shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));
+
+    dev_ctx.addMemory(op_key+"_input", in);
+    dev_ctx.addMemory(op_key+"_output", out);
+    dev_ctx.addMemory(op_key+"_filer", wgt);
+    dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive);
+    dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  PADDLE_ENFORCE(p, "Should have forward Primitive");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_input"), "Should have input memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_output"), "Should have output memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory");
+  PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc");
+  dev_ctx.submit(p);
+  dev_ctx.execute();  // the convert primitive should have already contained.
+
+```
+
+The `createPrimitiveDesc` returns the primitive descripotor of this operator, would be like this:
+```c++
+  auto* input = ctx.Input<Tensor>("Input");
+  auto* filter = ctx.Input<Tensor>("Filter");
+  auto* output = ctx.Output<Tensor>("Output");
+  std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+  int groups = ctx.Attr<int>("groups");
+  algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
+  prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;
+    
+  auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/);
+  shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));
+
+  return fwd_primitive_desc;
+  }
+```
+
+### MKLDNNDeviceContext
+`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed.
+
+
+### mkldnn_helper
+Some functions would be put in `paddle/platform/mkldnn_helper.h`.
+- create MKLDNN memories
+- create MKLDNN primitives
+- error check function
+- etc
+
+
+### Kernel Switch
+We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it.
+
+`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. 
+`trans` would be like this:
+
+```c++
+void trans(inputs, ctx) override {
+  if (NoNeedTrans()) {
+    return;
+  }
+  // find reorder primitive by op_key from context
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+  auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  auto& i = dev_ctx.findMemory(op_key + "_src_input");
+
+  if (p == nullptr || i == nullptr || changeSized(i, input)) {
+    auto prim = createPrimitiveDesc(ctx);
+    auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
+    auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
+    auto dst = createMemory(p->expected_desc(), newbuffer->data);
+    auto reorder_primitive(new mkldnn::reorder(src, dst));
+
+    dev_ctx.addMemory(op_key+"_src_input", src);
+    dev_ctx.addMemory(op_key+"_input", dst);
+    dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  PADDLE_ENFORCE(p, "Should have Reorder Primitive");
+  dev_ctx.submit(p);
+  if (! this->isMKLDNNKernel()) {
+    // execute immediately only if this is not mkldnn kernel function.
+    // otherwise, it can be executed with the operator primitive in Compute
+    dev_ctx.stream();
+  }
+  // after submit, the input tensor in ExecutionContext should be changed as the converted one
+  // there should be another mechanism to ensure this
+}
+```
+
+### Unit Test
+All the functions should be tested corresponding.
+TBD
diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa82e96bf79319f1a57e2ad58aa9826e57be6470
--- /dev/null
+++ b/doc/design/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. 
+
+We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. 
+
+For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+
+It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+Following is the details:
+
+### Place
+
+`Place` is defined as follows:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` is to represent the device memory where data is locating.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle.
+
+If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
+
+Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually a enum variable. If fluid is built with MKLDNN, then, the memory format in MKLDNN will be added into this enum variable too.
+
+- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not compile-time.
+
+- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators.
+
+`Layout` is also defined as a enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
index 202b4b65103c0b7c536a9cb466c4120ce134d8c3..691081c268b848811bf5ee6d6a41edfe0f47eec0 100644
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@@ -79,7 +79,7 @@ class Optimizer(object):
     def minimize(self, loss, parameter_list):
         """Add operations to minimize `loss` by updating `parameter_list`.
 
-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
         params_grads = self.create_backward_pass(loss, parameter_list)
diff --git a/doc/design/refactor/multi_cpu.md b/doc/design/refactor/multi_cpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8d8ee0422acc84835170a44eb83f9b5f0c6bb40
--- /dev/null
+++ b/doc/design/refactor/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This Design Doc propose an approach to make the user-defined Op graph
+running with multi-CPU, we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+<img src="src/multi-threads/single-thread@3x.png" width="300">
+
+After converted:
+
+<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+
+## Implement
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and Blocking `Wait`
+  for the atomic counter become `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] {bc.DecrementCount(); })
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and create run the specify Block on independent scope
+    with multi-threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merge all the gradients which calculated in different threads
+  with `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multi-threads, since we could
+  assign the parameters to the different threads and execute
+  optimizer with multi-threads.
diff --git a/doc/design/refactor/src/multi-threads.graffle b/doc/design/refactor/src/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/design/refactor/src/multi-threads.graffle differ
diff --git a/doc/design/refactor/src/multi-threads/multi-threads@3x.png b/doc/design/refactor/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/design/refactor/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/design/refactor/src/multi-threads/single-thread@3x.png b/doc/design/refactor/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/design/refactor/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
index fd23dc211a35fdc9d87bc9233fcf4e90254da748..f54b2b3694cc2a8f1d892792fd4d39a0484dc750 100644
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -25,13 +25,14 @@ There are mainly three parts that we have to consider while integrating a new de
 
 ### Place and DeviceContext
 
+Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
 
 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
 
 ```
-        |   CPUPlace   --> MKLDNNPlace
-Place --|   CUDAPlace  --> CUDNNPlace
+        |   CPUPlace
+Place --|   CUDAPlace
         |   FPGAPlace
 ```
 
@@ -43,7 +44,7 @@ typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
 
 #### DeviceContext
 
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
 
 
 ```
@@ -106,7 +107,7 @@ template <typename Place>
 size_t Used(Place place);
 ```
 
-To implementing these interfaces, we have to implement MemoryAllocator for different Devices
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.
 
 
 #### Tensor
@@ -243,6 +244,7 @@ REGISTER_OP_CUDA_KERNEL(
 Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
 
 
-We will discuss how to implement an efficient OpKernel switch policy. 
+For more details, please refer to following docs:
 
-- TBD
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
diff --git a/doc/design/switch_kernel.md b/doc/design/switch_kernel.md
new file mode 100644
index 0000000000000000000000000000000000000000..1846e5d9f99dd433b44ac6b5ae52893ec8f0d451
--- /dev/null
+++ b/doc/design/switch_kernel.md
@@ -0,0 +1,66 @@
+## Background
+Every operator has many kernels because there are multiple data types, places, data layout that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold. 
+
+The `KernelType` is as follows.
+
+```
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+
+The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
+
+The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`.
+
+The `layout` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+
+## Problem
+
+We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
+3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
+
+Problems under these situations are similar. We can formalise this problem as follow.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
+
+## Solution
+
+It is clearly that transforming inputs of an operator toadapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+
+We can infer a kernel type from the inputs of an operators. We let this kernel type as `actual kernel type`, which means this kernel type is the actually kernel type that operator should be performed.
+
+We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
+
+We transform the input data from `actual` to `expect` if the expect kernel type is not as same as actual kernel type.
+
+The algorithm is described as follow
+
+```cpp
+using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
+using KernelTypePair = std::pair<KernelType, KernelType>;
+
+map<KernelTypePair, DataTransformationFN> g_data_transformation_;
+
+void OpWithKernel::Run() {
+  vec<Tensor> inputs = ...
+  auto actual_kernel_type = GetActualKernelType(inputs);
+  
+  // The expected kernel type is related to actual kernel type.
+  // For the most operators, the expected kernel type is as same as
+  // actual kernel type.
+  //
+  // So we pass `actual_kernel_type` as a parameter of 
+  // GetExpectedKernelType
+  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
+  
+  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
+  
+  kernel.run(trans(inputs));
+}
+```
diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst
index a2bdeead7841393fdfe90c78e5b91d9e61678a24..ed8a0c7e87da133138ecfc7ba6a8217d58b8f71d 100644
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -109,3 +109,31 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二
 解决办法是：
 
 * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
+
+8. 下载MKLML库失败
+------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2
+    make[1]: *** 正在等待未完成的任务....
+
+原因：网速或SSL链接原因，导致MKLML库下载不成功。
+
+解决办法是：手动下载并安装，具体步骤如下。
+
+..  code-block:: bash
+
+    // 1. 进入对应的目录
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. 查看包的大小， 正常情况下是75M，如果小于75M，即下载失败：
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. 手动下载且解压缩，并手动生成download成功标签：
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. 接着编译即可
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index c875c807b8ab2e420dec189ef32d41533f58fa6d..41ac07ca5674d2c121baba77c58226ad328cd681 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其
    :header: "依赖", "版本", "说明"
    :widths: 10, 15, 30
 
-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
    "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
    "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
+   "Go", ">=1.8", "可选"
 
 
 .. _build_options:
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index f194f84ce7c961bb8644d7c077a7c71730220ea2..92211aee8c3bc0ae6e1a38311d40ddf92117cac7 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -76,13 +76,13 @@ will be downloaded automatically.
    :header: "Dependency", "Version", "Description"
    :widths: 10, 15, 30
 
-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
    "GCC", "4.8.2", "Recommend devtools2 for CentOS"
-   "Python", "2.7.x", "Need libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
    "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "Optional"
+   "Go", ">=1.8", "Optional"
 
 
 .. _build_options:
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 1eb06e4182d40c3be20d71e37b34009905eaf9d6..fa1b6a372728ccac128d2e6e79a6514b8884ea3f 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -128,7 +128,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
 AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
 是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
-`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
 
 以下指令能检查Linux电脑是否支持AVX：
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 5a46c598f2248c7912169a9e77b16851230c1d2e..06012bf65e75c32957516f6b7f62e09480871b84 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -137,7 +137,7 @@ GPU driver installed before move on.
 AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
 The latest PaddlePaddle Docker image turns AVX on by default, so, if your
 computer doesn't support AVX, you'll probably need to
-`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
 
 The following command will tell you whether your computer supports AVX.
 
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
index b270e2c2f0b0cbfd6fb4b9b0750d207952f84d76..a4587f82a984acf243f49834e707fcd66d5b1252 100644
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具
     :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
     :widths: 1, 3, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
 
 .. _pip_dependency:
 
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
index 70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1..55e31560a0f5087ab69966a6281c6c8573c04204 100644
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
     :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
     :widths: 1, 3, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
 
 .. _pip_dependency:
 
diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cc58dfee0bd6dade0340b4fd0ee1adb49ffebf6
--- /dev/null
+++ b/doc/getstarted/concepts/src/infer.py
@@ -0,0 +1,18 @@
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# loading the model which generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Input multiple sets of data，Output the infer result in a array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py
index 8aceb23406a476f08639cc6223cdf730b728a705..4bccbfca3c70c12aec564e2cae3b8ca174b68777 100644
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -26,6 +26,11 @@ def event_handler(event):
         if event.batch_id % 1 == 0:
             print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
                                                   event.cost)
+    # product model every 10 pass
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)
 
 
 # define training dataset reader
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index c243083794bb3c4659242de99b3b2715af9d7c24..e695ff283e2e806377a51c559b37e8068360a4ff 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和
 ..  literalinclude:: src/train.py
     :linenos:
 
+使用以上训练好的模型进行预测，取其中一个模型params_pass_90.tar，输入需要预测的向量组，然后打印输出：
+
+..  literalinclude:: src/infer.py
+    :linenos:
+
 有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 757a5840bca4c8028e362789ec95bb03d261d2c1..3109d72001f13a38a93b9ca39d3f8525c8cea9f1 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -53,7 +53,7 @@ Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor), 2D tensor of size (M x K)");
     AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -82,7 +82,7 @@ The equation is: Out = X * Y
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of scale operator.").NotInGradient();
     AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index fe86936bc12cc2fb88d653429e250f71a478dfb6..7175d8370d6ce08c6d502eb42b8e53252db89bbb 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -50,7 +50,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor), 2D tensor of size (M x K)");
     AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -79,7 +79,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of scale operator.").NotInGradient();
     AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 991b9e2596a3b499846b963152c838d66260265d..ccd909770253bb85dbc8a5a2560594076c2f68b0 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -9,9 +9,6 @@
 
   usage/cmd_parameter/index_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/k8s/k8s_basis_cn.md
-  usage/k8s/k8s_cn.md
-  usage/k8s/k8s_distributed_cn.md
 
 开发标准
 --------
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 61bf25ccd12eeedffc747fdd4ce84fa4adde07ee..6d1bf7dfc003da6de31410ee0a7959233adfaf76 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -9,8 +9,6 @@ Usage
 
   usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
-  usage/k8s/k8s_en.md
-  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
index 383acb0c8251043c3c6bbf309d2e07bf0074cd4f..e4211abb3be9cace80bc14dbe3db3e0a31221dd0 100644
--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
@@ -6,10 +6,10 @@ Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
 
 Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
 
-Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer
-
 Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
 
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+
 # Compile Time
 
 The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 2e98b3de3fe2284375f87e883ff4bac19255dbeb..659bae9c0ceaf2fb2df8446b9d406a822a9df0ea 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -1,25 +1,8 @@
-# PaddlePaddle分布式训练
-
-* [概述](#概述)
-* [环境准备](#环境准备)
-* [启动参数说明](#启动参数说明)
-  * [启动参数服务器](#启动参数服务器)
-  * [启动计算节点](#启动计算节点)
-  * [准备数据集](#准备数据集)
-  * [准备训练程序](#准备训练程序)
-* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
-  * [使用Fabric启动集群作业](#使用fabric启动集群作业)
-     * [准备一个Linux集群](#准备一个linux集群)
-     * [启动集群作业](#启动集群作业)
-     * [终止集群作业](#终止集群作业)
-     * [检查集群训练结果](#检查集群训练结果)
-     * [检查模型输出](#检查模型输出)
-  * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
-     * [准备OpenMPI集群](#准备OpenMPI集群)
-     * [启动集群作业](#启动集群作业-1)
-  * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
+# 分布式训练
+
 
 ## 概述
+
 本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
 
 <img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
@@ -32,10 +15,11 @@
 
 在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
 
+
 ## 环境准备
 
 1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
 
 安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
 ```bash
@@ -63,12 +47,12 @@ $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradie
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```
 
-| 参数  | 是否必选 | 默认值 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | 必选 | 7164 | pserver监听的起始端口，根据ports_num决定<br>总端口个数，从起始端口监听多个端口用于通信  |
-| ports_num  | 必选 | 1 | 监听的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+参数说明
+
+- port：**必选，默认7164**，pserver监听的起始端口，根据ports_num决定总端口个数，从起始端口监听多个端口用于通信
+- ports_num：**必选，默认1**，监听的端口个数
+- ports_num_for_sparse：**必选，默认1**，用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
 
 ### 启动计算节点
 执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
@@ -105,16 +89,16 @@ paddle.init(
         pservers="127.0.0.1")
 ```
 
-| 参数  | 是否必选 | 默认 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | 可选 | False | 是否启用GPU训练 |
-| trainer_count  | 必选 | 1 | 当前训练任务trainer总个数 |
-| port  | 必选 | 7164 | 连接到pserver的端口  |
-| ports_num  | 必选 | 1 | 连接到pserver的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
-| trainer_id  | 必选 | 0 | 每个trainer的唯一ID，从0开始的整数 |
-| pservers  | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开 |
+参数说明
+
+- use_gpu： **可选，默认False**，是否启用GPU训练
+- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- port：**必选，默认7164**，连接到pserver的端口
+- ports_num：**必选，默认1**，连接到pserver的端口个数
+- ports_num_for_sparse：**必选，默认1**，和pserver之间用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
+- pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开
 
 
 ### 准备数据集
@@ -171,7 +155,7 @@ test.txt-00002
 
 - `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
 - `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
-- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
 
   ```python
   cluster_train_file = "./train_data_dir/train/train.txt"
@@ -195,91 +179,10 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务
 
 在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
 
-### 使用Fabric启动集群作业
-
-#### 准备一个Linux集群
-可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
-
-#### 启动集群作业
-
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
-
-`paddle.py` 为方便作业启动提供了两个独特的命令选项。
-
--  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
--  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
-
-`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
-```
-sh run.sh
-```
-
-集群作业将会在几秒后启动。
-
-#### 终止集群作业
-`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
-
-#### 检查集群训练结果
-详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
-
-`paddle_trainer.INFO`
-提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
-
-`paddle_pserver2.INFO`
-提供 pserver 运行日志，有助于诊断分布式错误。
-
-`server.log`
-提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-`train.log`
-提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-#### 检查模型输出
-运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
-工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
-
-### 在OpenMPI集群中提交训练作业
-
-#### 准备OpenMPI集群
-
-执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
-
-```bash
-paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
-
-#### 启动集群作业
-
-您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
-
-```bash
-# 获得head和node节点的IP地址
-kubectl get po -o wide
-# 将node节点的IP地址保存到machines文件中
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# 拷贝必要的文件到head节点
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# ssh 登录到head节点
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- 以下操作均在head节点中执行 ---------------
-# 准备训练数据
-python prepare.py
-# 拷贝训练程序和字典文件到每台MPI节点
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# 创建日志目录
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# 拷贝训练数据到各自的节点
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# 启动训练任务
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### 在Kubernetes集群中提交训练作业
+## 在不同集群中运行
 
-此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
+  - [fabric集群](fabric_cn.md)
+  - [openmpi集群](openmpi_cn.md)
+  - [kubernetes单机](k8s_cn.md)
+  - [kubernetes distributed分布式](k8s_distributed_cn.md)
+  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index baa97c0c02ae490fff8587071bd2d4adfb5325e3..915405ca5b446981515e301ca4b7ee065a82a9ff 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,23 +1,4 @@
-# PaddlePaddle Distributed Training
-
-* [Introduction](#introduction)
-* [Preparations](#preparations)
-* [Command-line arguments](#command-line-arguments)
-   * [Starting parameter server](#starting-parameter-server)
-   * [Starting trainer](#starting-trainer)
-   * [Prepare Training Dataset](#prepare-training-dataset)
-   * [Prepare Training program](#prepare-training-program)
-* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
-   * [Cluster Training Using Fabric](#cluster-training-using-fabric)
-      * [Prepare a Linux cluster](#prepare-a-linux-cluster)
-      * [Launching Cluster Job](#launching-cluster-job)
-      * [Kill Cluster Job](#kill-cluster-job)
-      * [Check Cluster Training Result](#check-cluster-training-result)
-      * [Check Model Output](#check-model-output)
-   * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
-      * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
-      * [Launching Cluster Job](#launching-cluster-job-1)
-   * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
+# Distributed Training
 
 ## Introduction
 
@@ -35,7 +16,7 @@ When training with synchronize SGD, PaddlePaddle uses an internal "synchronize b
 
 ## Preparations
 1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
 
 After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
 
@@ -67,12 +48,12 @@ If you wish to run parameter servers in background, and save a log file, you can
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```
 
-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | required | 7164 | port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput |
-| ports_num  | required | 1 | total number of ports will listen on  |
-| ports_num_for_sparse  | required | 1 | number of ports which serves sparse parameter update  |
-| num_gradient_servers  | required | 1 | total number of gradient servers |
+Parameter Description
+
+- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
+- ports_num: **required, default 1**, total number of ports will listen on.
+- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update.
+- num_gradient_servers: **required, default 1**, total number of gradient servers.
 
 ### Starting trainer
 Type the command below to start the trainer(name the file whatever you want, like "train.py")
@@ -111,16 +92,16 @@ paddle.init(
         pservers="127.0.0.1")
 ```
 
-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | optional | False | set to "True" to enable GPU training |
-| trainer_count  | required | 1 | total count of trainers in the training job |
-| port  | required | 7164 | port to connect to parameter server  |
-| ports_num  | required | 1 | number of ports for communication |
-| ports_num_for_sparse  | required | 1 | number of ports for sparse type caculation |
-| num_gradient_servers  | required | 1 | total number of gradient server |
-| trainer_id  | required | 0 | ID for every trainer, start from 0 |
-| pservers  | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
+Parameter Description
+
+- use_gpu: **optional, default False**, set to "True" to enable GPU training.
+- trainer_count: **required, default 1**, total count of trainers in the training job.
+- port: **required, default 7164**, port to connect to parameter server.
+- ports_num: **required, default 1**, number of ports for communication.
+- ports_num_for_sparse: **required, default 1**, number of ports for sparse type caculation.
+- num_gradient_servers: **required, default 1**, total number of gradient server.
+- trainer_id: **required, default 0**, ID for every trainer, start from 0.
+- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
 
 ### Prepare Training Dataset
 
@@ -178,7 +159,7 @@ Your workspace may looks like:
 
 - `my_lib.py`: user defined libraries, like PIL libs. This is optional.
 - `word_dict.pickle`: dict file for training word embeding.
-- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:
 
   ```python
   cluster_train_file = "./train_data_dir/train/train.txt"
@@ -202,92 +183,9 @@ We'll introduce cluster job management on these platforms. The examples can be f
 
 These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
 
-### Cluster Training Using Fabric
-
-#### Prepare a Linux cluster
-
-Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
-
-#### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
-
-`paddle.py`provides two distinguished command option for easy job launching.
-
-- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
-- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
-dispatch latency.
-
-`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
-```
-sh run.sh
-```
-
-The cluster Job will start in several seconds.
-
-#### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
-
-#### Check Cluster Training Result
-Check log in $workspace/log for details, each node owns same log structure.
-
-`paddle_trainer.INFO`
-It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
-
-`paddle_pserver2.INFO`
-It provides parameter server running log, which could help to diagnose distributed error.
-
-`server.log`
-It provides stderr and stdout of parameter server process. Check error log if training crashes.
-
-`train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashes.
-
-#### Check Model Output
-After one pass finished, model files will be written in `output` directory in node 0.
-`nodefile` in workspace indicates the node id of current cluster job.
-
-### Cluster Training Using OpenMPI
-
-#### Prepare an OpenMPI cluster
-
-Run the following command to start a 3-node MPI cluster and one "head" node.
-
-```bash
-cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-Then you can log in to every OpenMPI node using ssh without input any passwords.
-
-#### Launching Cluster Job
-
-Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
-
-```bash
-# find out node IP addresses
-kubectl get po -o wide
-# generate a "machines" file containing node IP addresses
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# copy necessary files onto "head" node
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# login to head node using ssh
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- in head node ---------------
-# prepare training data
-python prepare.py
-# copy training data and dict file to MPI nodes
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# creat a directory for storing log files
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# copy training data to every node
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# start the job
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### Cluster Training Using Kubernetes
+## Use different clusters
 
-The details can be found [here](../k8s/k8s_cn.md)
+  - [fabric](fabric_en.md)
+  - [openmpi](openmpi_en.md)
+  - [kubernetes](k8s_en.md)
+  - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/usage/cluster/fabric_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..0385e401b399a51fad112e604dc56cb2f84c0a4b
--- /dev/null
+++ b/doc/howto/usage/cluster/fabric_cn.md
@@ -0,0 +1,42 @@
+# 使用fabric启动集群训练
+
+## 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
+
+## 启动集群作业
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+## 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+## 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+## 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/usage/cluster/fabric_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf270d89ab8514801ca4629cf412f73257429df9
--- /dev/null
+++ b/doc/howto/usage/cluster/fabric_en.md
@@ -0,0 +1,43 @@
+# Cluster Training Using Fabric
+
+## Prepare a Linux cluster
+
+Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
+
+## Launching Cluster Job
+`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+
+`paddle.py`provides two distinguished command option for easy job launching.
+
+- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
+- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+dispatch latency.
+
+`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
+```
+sh run.sh
+```
+
+The cluster Job will start in several seconds.
+
+## Kill Cluster Job
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
+
+## Check Cluster Training Result
+Check log in $workspace/log for details, each node owns same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides parameter server running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of trainer process. Check error log if training crashes.
+
+## Check Model Output
+After one pass finished, model files will be written in `output` directory in node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/usage/cluster/k8s_aws_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..c44cd9a731bed7067cdf19aa2f714abdce6c736a
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
@@ -0,0 +1 @@
+k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/cluster/k8s_aws_en.md
similarity index 98%
rename from doc/howto/usage/k8s/k8s_aws_en.md
rename to doc/howto/usage/cluster/k8s_aws_en.md
index ce72b0803818d5bf0c18753c421848cf2fc1b668..0dfa8237a3fa2c9c3ee11e873c9fbbed3cd6018f 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/cluster/k8s_aws_en.md
@@ -493,7 +493,7 @@ spec:
     spec:
       containers:
       - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
         imagePullPolicy: Always
         volumeMounts:
         - mountPath: "/efs"
@@ -522,7 +522,7 @@ NAME          DESIRED   SUCCESSFUL   AGE
 paddle-data   1         1            6m
 ```
 
-Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
 
 #### Start Training
 
@@ -545,7 +545,7 @@ spec:
           claimName: efsvol
       containers:
       - name: trainer
-        image: paddledev/paddle-tutorial:k8s_train
+        image: paddlepaddle/paddle-tutorial:k8s_train
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -617,7 +617,7 @@ kubectl --kubeconfig=kubeconfig log -f POD_NAME
 
 Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
 
-The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
 
 #### Inspect Training Output
 
diff --git a/doc/howto/usage/k8s/k8s_cn.md b/doc/howto/usage/cluster/k8s_cn.md
similarity index 83%
rename from doc/howto/usage/k8s/k8s_cn.md
rename to doc/howto/usage/cluster/k8s_cn.md
index ab07cb9cd5b135ddea82b3360720537f1dc5a801..c1a11f7165a2f9da9dd044641274447e7943a597 100644
--- a/doc/howto/usage/k8s/k8s_cn.md
+++ b/doc/howto/usage/cluster/k8s_cn.md
@@ -1,21 +1,22 @@
 # Kubernetes单机训练
 
-在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
+在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
 ## 制作Docker镜像
 
-在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
-Paddle的Docker image里。为此，我们需要制作一个包含训练数据的Paddle镜像。
+在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
+PaddlePaddle的Docker Image里。为此，我们需要制作一个包含训练数据的PaddlePaddle镜像。
+
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo，
+（请注意，默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的，PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)），
+下面我们使用这个镜像来下载数据到Docker Container中，并把这个包含了训练数据的Container保存为一个新的镜像。
 
-Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) 
-里介绍了用Paddle源码中的脚本下载训练数据的过程。
-而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo，（ 请注意，默认的
-Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ），所以我们使用这个镜像来下载训练数据到Docker container中，然后把这个包含了训练数据的container保存为一个新的镜像。
-  
 ### 运行容器
 
 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```
 
 ### 下载数据
@@ -103,7 +104,7 @@ spec:
       restartPolicy: Never
 ```
 
-### 创建Paddle Job
+### 创建PaddlePaddle Job
 
 使用上文创建的yaml文件创建Kubernetes Job，命令为：
 
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s_distributed_cn.md
similarity index 88%
rename from doc/howto/usage/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/cluster/k8s_distributed_cn.md
index a9bebf09558b06993119803458977abedbbfbdd0..167089b8074b33e3b094fa3ec8e377630cec42ac 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/cluster/k8s_distributed_cn.md
@@ -1,8 +1,6 @@
 # Kubernetes分布式训练
 
-前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
-
-有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
 ## 整体方案
 
@@ -28,7 +26,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行
 - 拷贝训练文件到容器内
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
 
-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
 
 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -62,7 +60,7 @@ spec:
       hostNetwork: true
       containers:
       - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
         imagePullPolicy: Always
         volumeMounts:
         - mountPath: "/mnt"
@@ -149,20 +147,19 @@ spec:
 
 文件中，`metadata`下的`name`表示这个job的名字。`parallelism，completions`字段表示这个job会同时开启3个PaddlePaddle节点，成功训练且退出的pod数目为3时，这个job才算成功结束。然后申明一个存储卷`jobpath`，代表宿主机目录`/home/work/mfs`，在对容器的描述`containers`字段中，将此目录挂载为容器的`/home/jobpath`目录，这样容器的`/home/jobpath`目录就成为了共享存储，放在这个目录里的文件其实是保存到了MFS上。
 
-`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内：
+
 
-环境变量 | 说明
---- | ---
-JOB_PATH | 共享存储挂在的路径
-JOB_NAME | Job的名字
-TRAIN_CONFIG_DIR | 本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数，即网卡名
-CONF_PADDLE_PORT | `paddle paserver`的`--port`参数
-CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量，即`--ports_num`参数
-CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量，即`--ports_num_for_sparse`参数
-CONF_PADDLE_GRADIENT_NUM | 训练节点数量，即`--num_gradient_servers参数`
+- JOB_PATH：共享存储挂在的路径
+- JOB_NAME：Job的名字
+- TRAIN_CONFIG_DIR：本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC：`paddle pserver`进程需要的`--nics`参数，即网卡名
+- CONF_PADDLE_PORT：`paddle paserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM：稠密更新的端口数量，即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE：稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM：训练节点数量，即`--num_gradient_servers参数`
 
-这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
 
 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
 
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/cluster/k8s_en.md
similarity index 79%
rename from doc/howto/usage/k8s/k8s_en.md
rename to doc/howto/usage/cluster/k8s_en.md
index 0c3ab05b708e7a924577c26496b8c55126e76c62..c374f00a495d705ceddf8d3d930768ceeb93282b 100644
--- a/doc/howto/usage/k8s/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s_en.md
@@ -1,18 +1,27 @@
-# Paddle On Kubernetes
+# PaddlePaddle On Kubernetes
 
->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.
 
 ## Build Docker Image
 
-In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+In distributed Kubernetes cluster, we will use Ceph or other distributed
+storage system for storing training related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo training job on single machine. In order to simplify the requirement
+of the environment, we will directly put training data into the PaddlePaddle Docker Image,
+so we need to create a PaddlePaddle Docker image that includes the training data.
+
+The production Docker Image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demo. (Caution: Default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include
+the source code, PaddlePaddle's different versions of Docker Image can be referred here:
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)),
+so we run this Docker Image and download the training data, and then commit the whole
+Container to be a new Docker Image.
 
-Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
-And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
-  
 ### Run Docker Container
 
 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```
 
 ### Download Training Data
@@ -67,7 +76,7 @@ $ docker commit quick_start_data mypaddle/paddle:quickstart
 
 ## Use Kubernetes For Training
 
->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
 
 ### Create Yaml Files
 
@@ -99,7 +108,7 @@ spec:
       restartPolicy: Never
 ```
 
-### Start Paddle Job
+### Start PaddlePaddle Job
 
 Using the above yaml file to start the Kubernetes job.
 
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/usage/cluster/openmpi_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..831cafdc03c6a908f31769d0467de022df42dab5
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_cn.md
@@ -0,0 +1,41 @@
+# 在OpenMPI集群中提交训练作业
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/usage/cluster/openmpi_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..09af46e25ebe1f843dc7c7be0997dc706413b65c
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_en.md
@@ -0,0 +1,41 @@
+# Cluster Training Using OpenMPI
+
+## Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without input any passwords.
+
+## Launching Cluster Job
+
+Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# creat a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/k8s/src/Dockerfile b/doc/howto/usage/cluster/src/Dockerfile
similarity index 54%
rename from doc/howto/usage/k8s/src/Dockerfile
rename to doc/howto/usage/cluster/src/Dockerfile
index 3a73606c61432329b4cc2d2f8daadc5af8735c96..e178bf4da0f32fca9586b5b69a2c7419de5d9cb1 100644
--- a/doc/howto/usage/k8s/src/Dockerfile
+++ b/doc/howto/usage/cluster/src/Dockerfile
@@ -1,4 +1,4 @@
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest
 
 MAINTAINER zjsxzong89@gmail.com
 
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/add_security_group.png
rename to doc/howto/usage/cluster/src/add_security_group.png
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/k8s/src/create_efs.png
rename to doc/howto/usage/cluster/src/create_efs.png
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/k8s/src/efs_mount.png
rename to doc/howto/usage/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde
Binary files /dev/null and b/doc/howto/usage/cluster/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/README.md
rename to doc/howto/usage/cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/get_data.sh
rename to doc/howto/usage/cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
similarity index 77%
rename from doc/howto/usage/k8s/src/k8s_train/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_train/Dockerfile
index c0fca1f9a945921e6e8899fee2db8845e66136a1..77f021a89a70d934bf70424eaa3c6dc3f7c93a28 100644
--- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
@@ -1,4 +1,4 @@
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest
 
 COPY start.sh /root/
 COPY start_paddle.py /root/
diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/README.md
rename to doc/howto/usage/cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_train/start.sh b/doc/howto/usage/cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/start.sh
rename to doc/howto/usage/cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/start_paddle.py
rename to doc/howto/usage/cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/k8s/src/managed_policy.png
rename to doc/howto/usage/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/k8s/src/pserver_and_trainer.png
rename to doc/howto/usage/cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/k8s/src/route53_create_recordset.png
rename to doc/howto/usage/cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/k8s/src/route53_create_zone.png
rename to doc/howto/usage/cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/worker_security_group.png
rename to doc/howto/usage/cluster/src/worker_security_group.png
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
deleted file mode 100644
index 4c3dc81ed38f239c1f4a83d22b49cf57b5d16a8b..0000000000000000000000000000000000000000
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Kubernetes 简介
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
-
-- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
-
-- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
-
-## 部署Kubernetes集群
-
-Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出集中常见的部署方法：
-
-- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
-- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
-- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
-- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
-
-可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
-
-## 选择存储方案
-
-容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
-常见的可选存储服务包括：
-
-- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
-- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
-- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
-- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
-
-## 配置kubectl
-
-### 安装kubectl
-```
-# OS X
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
-
-# Linux
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
-
-# Windows
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
-```
-
-### 配置kubectl访问你的kubernetes集群
-
-编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
-```
-apiVersion: v1
-clusters:
-- cluster:
-    certificate-authority: /path/to/ca.crt
-    server: https://[Master-IP]:443
-  name: minikube
-contexts:
-- context:
-    cluster: minikube
-    user: minikube
-  name: minikube
-current-context: minikube
-kind: Config
-preferences: {}
-users:
-- name: minikube
-  user:
-    client-certificate: /path/to/apiserver.crt
-    client-key: /Users/wuyi/.minikube/apiserver.key
-```
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
deleted file mode 100644
index 2183a232ad402b76f82a67234a5c93e13ce97ac3..0000000000000000000000000000000000000000
Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index 9da48e7f2119ce901fbb3abab73400df27be16d2..d5196d9a4c93c7692d2a624ec7d0650e32806338 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -18,11 +18,11 @@ PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/
 
 - `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后，PaddlePaddle的CMake系统会自动编译所有的第三方依赖库，并且强制设置一些PaddlePaddle参数的值（`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
 - `WITH_C_API`，是否编译C-API预测库，必须设置为ON。在iOS平台上只支持使用C-API来预测。
-- `WITH_SWIG_PY`，必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。
 
 iOS平台可选配置参数：
 
-- `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
+- `IOS_PLATFORM`，可设置为`OS`（默认值）或`SIMULATOR`。
   - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
   - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa390cd61f3fbd75e5a3b342f3559e76da35a918
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -0,0 +1,120 @@
+# PaddlePaddle Compiling Guide for iOS
+
+This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
+
+## Preparation
+
+Apple provides Xcode for cross-compiling and IDE for iOS development. Download from App store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run command as follows
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides cross-compiling toolchain configuration documentation [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory environment variables need to be set before cross compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`.
+
+- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. It's not supported to inference or train via swig in iOS.
+
+Optional environment variables for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, build targets ARM-based physical devices like iPhone or iPad.
+  - `SIMULATOR`, build targets x86 architecture simulators.
+- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below:
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`, minimum iOS version to deployment, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to `SDK` root, can be explicitly set with your  `/path/to/platform/Developer/SDKs/SDK`. if left black, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`.
+
+other settings：
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist.
+
+some typical cmake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can set other compiling parameters for your own need. I.E. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` with `MinSizeRel`; or if the performance is your concern, set `CMAKE_BUILD_TYPE` with `Release`. You can even manipulate the PaddlePaddle compiling procedure by manually set `CMAKE_C/CXX_FLAGS` values.
+
+**TIPS for a better performance**:
+
+- set `CMAKE_BUILD_TYPE` with `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
+
+## Compile and install
+
+After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library.
+
+```
+$ make
+$ make install
+```
+
+Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration.
+
+`your/path/to/install` directory will have following directories after `compile` and `install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains PaddlePaddle C-API static library.
+- `third_party` contains all the 3rd party libraries.
+
+Please note: if PaddlePaddle library need to support both physical devices and simulators, you will need to compile correspondingly, then merge fat library with `lipo`.
+
+Now you will have PaddlePaddle library compiled and installed, the fat library can be used in deep learning related iOS APPs. Please refer to C-API documentation for usage guides.
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
index 3c08d736717cfe8d5fdf449dc58015086befbe60..ef421dacad458828cadf8cf505375d6c4bfd9dde 100644
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
@@ -5,4 +5,5 @@ MOBILE
   :maxdepth: 1
 
   cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
   cross_compiling_for_raspberry_en.md
diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c
index 89c4d7f00aae2a92ae30ba7b4305550d150dd985..05ec421fff6e1c57b0bace080668d3793f85480f 100644
--- a/go/pserver/client/c/test/test_cclient.c
+++ b/go/pserver/client/c/test/test_cclient.c
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 4b0eff3adb6fff0c9599b8613c5f19daea840674..738684795d8170ffd5c5b2bf19e6e150219332d0 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -21,6 +21,8 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
+cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
+cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
@@ -29,12 +31,13 @@ cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
+    shape_inference data_transform)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -58,3 +61,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+
+cc_library(threadpool SRCS threadpool.cc)
+cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
+cc_test(init_test SRCS init_test.cc DEPS init)
+
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
index b1e17936417e4ce09bace1d1a5d346d1c9cfa710..b0fd4d2750eb2529706d871947332d39494505cd 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -19,42 +19,42 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
   switch (attr_desc.type()) {
-    case framework::AttrType::BOOLEAN: {
+    case proto::AttrType::BOOLEAN: {
       return attr_desc.b();
     }
-    case framework::AttrType::INT: {
+    case proto::AttrType::INT: {
       return attr_desc.i();
     }
-    case framework::AttrType::FLOAT: {
+    case proto::AttrType::FLOAT: {
       return attr_desc.f();
     }
-    case framework::AttrType::STRING: {
+    case proto::AttrType::STRING: {
       return attr_desc.s();
     }
-    case framework::AttrType::BOOLEANS: {
+    case proto::AttrType::BOOLEANS: {
       std::vector<bool> val(attr_desc.bools_size());
       for (int i = 0; i < attr_desc.bools_size(); ++i) {
         val[i] = attr_desc.bools(i);
       }
       return val;
     }
-    case framework::AttrType::INTS: {
+    case proto::AttrType::INTS: {
       std::vector<int> val(attr_desc.ints_size());
       for (int i = 0; i < attr_desc.ints_size(); ++i) {
         val[i] = attr_desc.ints(i);
       }
       return val;
     }
-    case framework::AttrType::FLOATS: {
+    case proto::AttrType::FLOATS: {
       std::vector<float> val(attr_desc.floats_size());
       for (int i = 0; i < attr_desc.floats_size(); ++i) {
         val[i] = attr_desc.floats(i);
       }
       return val;
     }
-    case framework::AttrType::STRINGS: {
+    case proto::AttrType::STRINGS: {
       std::vector<std::string> val(attr_desc.strings_size());
       for (int i = 0; i < attr_desc.strings_size(); ++i) {
         val[i] = attr_desc.strings(i);
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index 0641907d6ff7546df1601d3b0263ff42f4186968..c1c63d9cb13acb195b3bc3b30088f5fa7daf2a3d 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,12 +27,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 template <typename T>
-inline AttrType AttrTypeID() {
+inline proto::AttrType AttrTypeID() {
   Attribute tmp = T();
-  return static_cast<AttrType>(tmp.which() - 1);
+  return static_cast<proto::AttrType>(tmp.which() - 1);
 }
 
-Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);
 
 class AttrReader {
  public:
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index faf6e60cbd1bcda9864c12696b336998ea7606b7..eaf13ddcefcd8dc5a6b0438f765d8d325925aa30 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/backward.h"
 #include "paddle/operators/net_op.h"
@@ -42,7 +42,7 @@ static std::unordered_set<std::string>& CtrlFlowOps() {
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
     const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
     std::unordered_map<std::string, std::string>* grad_to_var) {
-  OpDescBind op_desc;
+  OpDesc op_desc;
   op_desc.SetInputMap(op.Inputs());
   op_desc.SetOutputMap(op.Outputs());
   op_desc.SetType(op.Type());
@@ -53,7 +53,7 @@ static inline std::unique_ptr<OperatorBase> CreateGradOp(
   grad_ops.reserve(grad_descs.size());
   std::transform(grad_descs.begin(), grad_descs.end(),
                  std::back_inserter(grad_ops),
-                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                 [](const std::unique_ptr<OpDesc>& grad_desc) {
                    return OpRegistry::CreateOp(*grad_desc);
                  });
   PADDLE_ENFORCE(!grad_ops.empty());
@@ -217,7 +217,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         // If part of input gradient of that operator is not calculated, fill
         // zero variables to that input gradient.
         net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Y", {grad_input}}},
+                                           {{"Out", {grad_input}}},
                                            AttributeMap{}));
       }
       return false;
@@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) {
 static void CreateGradVarInBlock(
     size_t grad_op_start_index,
     const std::unordered_map<std::string, std::string>& param_name_map,
-    BlockDescBind* block_desc,
+    BlockDesc* block_desc,
     std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
   auto ops = block_desc->AllOps();
   for (size_t op_index = grad_op_start_index; op_index < ops.size();
@@ -341,7 +341,7 @@ static void CreateGradVarInBlock(
       auto* param = block_desc->FindVarRecursive(pname);
       auto* grad = block_desc->FindVar(arg);
       if (param == nullptr) {
-        grad->SetDataType(DataType::FP32);
+        grad->SetDataType(proto::DataType::FP32);
       } else {
         grad->SetDataType(param->GetDataType());
       }
@@ -350,12 +350,11 @@ static void CreateGradVarInBlock(
   }
 }
 
-std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
-    const OpDescBind* op_desc, std::unordered_set<std::string>* no_grad_vars,
+std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
+    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var,
-    const std::vector<BlockDescBind*>& grad_block =
-        std::vector<BlockDescBind*>()) {
-  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
+  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
   // All input gradients of forwarding operator do not need to calculate.
   const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
   if (AllGradInSet(inputs, *no_grad_vars)) {
@@ -386,7 +385,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
           .Get(op_desc->Type())
           .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
 
-  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
   for (auto& desc : grad_op_descs) {
     for (const std::string& in_name : desc->InputArgumentNames()) {
       if (no_grad_vars->count(in_name)) {
@@ -394,9 +393,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
             0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
         std::string new_name = prefix + kZeroVarSuffix;
         desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDescBind> fill_zeros_op(
-            new OpDescBind("fill_zeros_like", {{"X", {prefix}}},
-                           {{"Y", {new_name}}}, AttributeMap{}));
+        std::unique_ptr<OpDesc> fill_zeros_op(
+            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
+                       {{"Out", {new_name}}}, AttributeMap{}));
         pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
       }
     }
@@ -408,34 +407,33 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
   return grad_op_descs;
 }
 
-static BlockDescBind* CreateStepBlock(
-    ProgramDescBind& program_desc,
-    std::unordered_set<std::string>* no_grad_vars,
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var,
     int step_block_idx);
 
-std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
-    ProgramDescBind& program_desc, int block_idx,
+std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
+    ProgramDesc& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var) {
   VLOG(5) << "MakeBlockBackward";
-  BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
-  std::vector<OpDescBind*> op_descs = cur_block->AllOps();
+  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
+  std::vector<OpDesc*> op_descs = cur_block->AllOps();
   std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
   size_t grad_desc_idx = 0;
-  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  std::vector<std::unique_ptr<OpDesc>> backward_descs;
 
   for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
     VLOG(5) << "Making backward " << (*it)->Type() << " op";
-    std::vector<std::unique_ptr<OpDescBind>> op_grads;
+    std::vector<std::unique_ptr<OpDesc>> op_grads;
 
     if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
       int step_block_idx = (*it)->GetBlockAttr("sub_block");
-      BlockDescBind* backward_block = CreateStepBlock(
-          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
+                                                  grad_to_var, step_block_idx);
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else if ((*it)->Type() == "conditional_block") {
-      BlockDescBind* backward_block =
+      BlockDesc* backward_block =
           CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
                           (*it)->GetBlockAttr("sub_block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
@@ -463,14 +461,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
       }
       ++grad_desc_idx;
     }
-    std::transform(
-        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
-        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+    std::transform(op_grads.begin(), op_grads.end(),
+                   std::back_inserter(backward_descs),
+                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
   }
 
   VLOG(5) << "Appending Sums";
   // Check whether some variables are written more than once
-  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
   for (const auto& dup : dup_out_ops) {
     const std::string& out_name = dup.first;
     const std::vector<size_t> dup_op = dup.second;
@@ -486,18 +484,17 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         sum_op_inputs.emplace_back(new_name);
         next_g_name = sum_op_inputs.back();
       }
-      std::unique_ptr<OpDescBind> sum_op(
-          new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}},
-                         AttributeMap{}));
+      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
+                                                {{"Out", {out_name}}},
+                                                AttributeMap{}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
     }
   }
 
-  pending_sum_ops.sort(
-      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
-         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
-        return a.first > b.first;
-      });
+  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
+                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
+    return a.first > b.first;
+  });
   for (auto& p : pending_sum_ops) {
     backward_descs.insert(backward_descs.begin() + p.first + 1,
                           std::move(p.second));
@@ -508,14 +505,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
   return backward_descs;
 }
 
-static BlockDescBind* CreateStepBlock(
-    ProgramDescBind& program_desc,
-    std::unordered_set<std::string>* no_grad_vars,
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var,
     int step_block_idx) {
   auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
                                                    no_grad_vars, grad_to_var);
-  BlockDescBind* backward_block =
+  BlockDesc* backward_block =
       program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
   for (auto& ptr : backward_block_op_descs) {
     backward_block->AppendAllocatedOp(move(ptr));
@@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock(
 }
 
 ParamGradInfoMap AppendBackward(
-    ProgramDescBind& program_desc, const VarDescBind& target,
+    ProgramDesc& program_desc, const VarDesc& target,
     const std::unordered_set<std::string>& no_grad_vars) {
   std::unordered_set<std::string> no_grad_var_names;
   no_grad_var_names.reserve(no_grad_vars.size() + 1);
@@ -541,11 +537,11 @@ ParamGradInfoMap AppendBackward(
   PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
-  std::unique_ptr<OpDescBind> fill_one_op(
-      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", std::vector<int>{1}},
-                      {"value", static_cast<float>(1.0)},
-                      {"dtype", target.GetDataType()}}));
+  std::unique_ptr<OpDesc> fill_one_op(
+      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
+                 {{"shape", std::vector<int>{1}},
+                  {"value", static_cast<float>(1.0)},
+                  {"dtype", target.GetDataType()}}));
   // infer var type of fill_one_op
   fill_one_op->InferVarType(root_block);
 
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
index 96154fa82cb7a486aa4762ae633982ed6735220b..69ee3802369c16a8b21c0710d2008ef3c085cc5c 100644
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
                                             GradVarInfo /*grad_var_info*/>;
 
 ParamGradInfoMap AppendBackward(
-    ProgramDescBind& program_desc, const VarDescBind& target,
+    ProgramDesc& program_desc, const VarDesc& target,
     const std::unordered_set<std::string>& no_grad_vars);
 
 }  // namespace framework
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 9fe49881d5b740655432f6e83a7886878ceb17e8..692406b1c37d0c02714eafb5cf9a28329ed873bc 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/backward.h"
 
@@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker {
   using SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<OpDescBind> Apply() const override {
-    auto grad_op = new OpDescBind();
+  std::unique_ptr<OpDesc> Apply() const override {
+    auto grad_op = new OpDesc();
     grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
     grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
     grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
     grad_op->SetType("rowwise_add_grad");
-    return std::unique_ptr<OpDescBind>(grad_op);
+    return std::unique_ptr<OpDesc>(grad_op);
   }
 };
 
@@ -159,14 +159,14 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
   FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "x");
-    AddOutput("Y", "out");
+    AddOutput("Out", "out");
     AddComment("");
   }
 };
 
 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "the input tensors of sum operator.").AsDuplicable();
     AddOutput("Out", "the output tensor of sum operator.");
@@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase {
  public:
   using GradOpDescMakerBase::GradOpDescMakerBase;
 
-  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
-    std::vector<std::unique_ptr<OpDescBind>> retv;
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<OpDesc>> retv;
     auto x_g = InputGrad("X");
     if (!x_g.empty()) {
-      auto *op_desc = new OpDescBind();
+      auto *op_desc = new OpDesc();
       op_desc->SetType("scale");
       op_desc->SetInput("X", OutputGrad("Out"));
       op_desc->SetOutput("Out", x_g);
@@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase {
 
     auto y_g = InputGrad("Y");
     if (!y_g.empty()) {
-      auto *op_desc = new OpDescBind();
+      auto *op_desc = new OpDesc();
       op_desc->SetType("scale");
       op_desc->SetInput("X", OutputGrad("Out"));
       op_desc->SetOutput("Out", y_g);
@@ -430,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) {
   ASSERT_EQ("fill_zeros_like", fill_zero.Type());
   ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
   ASSERT_EQ("Z", fill_zero.Input("X"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Y").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y"));
+  ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
 
   auto &d_many_out = *net->ops_[1];
   ASSERT_EQ("many_output_op_grad", d_many_out.Type());
@@ -505,25 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
 }
 
 TEST(Backward, simple_single_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
 
-  f::OpDescBind *op = block->AppendOp();
+  f::OpDesc *op = block->AppendOp();
   op->SetType("rowwise_add");
   op->SetInput("X", {"x"});
   op->SetInput("b", {"b"});
   op->SetOutput("Out", {"out"});
 
-  auto target = f::VarDescBind("out");
+  auto target = f::VarDesc("out");
   target.SetShape({1});
   auto var_to_grad =
       AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
-  f::OpDescBind *fill_op = block->AllOps()[1];
+  f::OpDesc *fill_op = block->AllOps()[1];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
 
-  f::OpDescBind *grad_op = block->AllOps()[2];
+  f::OpDesc *grad_op = block->AllOps()[2];
   EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
   ASSERT_EQ(grad_op->InputNames().size(), 1UL);
   ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
@@ -543,16 +543,16 @@ TEST(Backward, simple_single_op) {
 }
 
 TEST(Backward, default_attribute) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op = block->AppendOp();
   op->SetType("mul");
   op->SetInput("X", {"x"});
   op->SetInput("Y", {"y"});
   op->SetOutput("Out", {"out"});
   op->CheckAttrs();
 
-  auto target = f::VarDescBind("out");
+  auto target = f::VarDesc("out");
   target.SetShape({1});
   AppendBackward(program, target, std::unordered_set<std::string>{});
 
@@ -560,47 +560,47 @@ TEST(Backward, default_attribute) {
   EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
   EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
 
-  f::OpDescBind *fill_op = block->AllOps()[1];
+  f::OpDesc *fill_op = block->AllOps()[1];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
 
-  f::OpDescBind *grad_op = block->AllOps()[2];
+  f::OpDesc *grad_op = block->AllOps()[2];
   ASSERT_EQ(grad_op->Type(), "mul_grad");
   EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
   EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
 }
 
 TEST(Backward, simple_mult_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
   op1->SetType("rowwise_add");
   op1->SetInput("X", {"x1"});
   op1->SetInput("b", {"b1"});
   op1->SetOutput("Out", {"out1"});
 
-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
   op2->SetType("mul");
   op2->SetInput("X", {"out1"});
   op2->SetInput("Y", {"y2"});
   op2->SetOutput("Out", {"out2"});
 
-  f::OpDescBind *op3 = block->AppendOp();
+  f::OpDesc *op3 = block->AppendOp();
   op3->SetType("rowwise_add");
   op3->SetInput("X", {"out2"});
   op3->SetInput("b", {"b3"});
   op3->SetOutput("Out", {"out3"});
 
-  auto target = f::VarDescBind("out3");
+  auto target = f::VarDesc("out3");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad =
       AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 6UL + 1);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
 
-  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  f::OpDesc *grad_op1 = block->AllOps()[6];
   EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
   ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
   ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -611,7 +611,7 @@ TEST(Backward, simple_mult_op) {
   EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
             std::vector<std::string>({f::GradVarName("b1")}));
 
-  f::OpDescBind *grad_op2 = block->AllOps()[5];
+  f::OpDesc *grad_op2 = block->AllOps()[5];
   EXPECT_EQ(grad_op2->Type(), "mul_grad");
   ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
   ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
@@ -625,7 +625,7 @@ TEST(Backward, simple_mult_op) {
   EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
             std::vector<std::string>({f::GradVarName("y2")}));
 
-  f::OpDescBind *grad_op3 = block->AllOps()[4];
+  f::OpDesc *grad_op3 = block->AllOps()[4];
   EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
   ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
   ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
@@ -655,42 +655,42 @@ TEST(Backward, simple_mult_op) {
 }
 
 TEST(Backward, intermedia_var_no_grad) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
   op1->SetType("rowwise_add");
   op1->SetInput("X", {"x1"});
   op1->SetInput("b", {"b1"});
   op1->SetOutput("Out", {"out1"});
 
-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
   op2->SetType("mul");
   op2->SetInput("X", {"x2"});
   op2->SetInput("Y", {"y2"});
   op2->SetOutput("Out", {"out2"});
 
-  f::OpDescBind *op3 = block->AppendOp();
+  f::OpDesc *op3 = block->AppendOp();
   op3->SetType("rowwise_add");
   op3->SetInput("X", {"out2"});
   op3->SetInput("b", {"b3"});
   op3->SetOutput("Out", {"out3"});
 
-  f::OpDescBind *op4 = block->AppendOp();
+  f::OpDesc *op4 = block->AppendOp();
   op4->SetType("mul");
   op4->SetInput("X", {"out1"});
   op4->SetInput("Y", {"out3"});
   op4->SetOutput("Out", {"out4"});
 
-  auto target = f::VarDescBind("out4");
+  auto target = f::VarDesc("out4");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"out3"});
 
   ASSERT_EQ(block->AllOps().size(), 7UL);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
 
-  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  f::OpDesc *grad_op1 = block->AllOps()[6];
   EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
   ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
   ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -701,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) {
   EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
             std::vector<std::string>({f::GradVarName("b1")}));
 
-  f::OpDescBind *grad_op4 = block->AllOps()[5];
+  f::OpDesc *grad_op4 = block->AllOps()[5];
   EXPECT_EQ(grad_op4->Type(), "mul_grad");
   ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
   ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
@@ -726,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) {
 }
 
 TEST(Backward, var_no_grad) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
   op1->SetType("mult_in_out");
   op1->SetInput("X", {"x1"});
   op1->SetInput("H", {"h1"});
   op1->SetOutput("Y", {"y1"});
   op1->SetOutput("Z", {"z1"});
 
-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
   op2->SetType("mult_in_out");
   op2->SetInput("X", {"y1"});
   op2->SetInput("H", {"z1"});
   op2->SetOutput("Y", {"y2"});
   op2->SetOutput("Z", {"z2"});
 
-  auto target = f::VarDescBind("z2");
+  auto target = f::VarDesc("z2");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"z1"});
 
   ASSERT_EQ(block->AllOps().size(), 6UL);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
 
-  f::OpDescBind *grad_op2 = block->AllOps()[3];
+  f::OpDesc *grad_op2 = block->AllOps()[3];
   ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
   ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
   ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
@@ -767,15 +767,15 @@ TEST(Backward, var_no_grad) {
             std::vector<std::string>({f::GradVarName("y1")}));
   EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
 
-  f::OpDescBind *fill_zero_op = block->AllOps()[4];
+  f::OpDesc *fill_zero_op = block->AllOps()[4];
   ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
   ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
   ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
   EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(fill_zero_op->Output("Y"),
+  EXPECT_EQ(fill_zero_op->Output("Out"),
             std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
 
-  f::OpDescBind *grad_op1 = block->AllOps()[5];
+  f::OpDesc *grad_op1 = block->AllOps()[5];
   ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
   ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
   ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -803,37 +803,37 @@ TEST(Backward, var_no_grad) {
 }
 
 TEST(Backward, shared_var) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
   op1->SetType("rowwise_add");
   op1->SetInput("X", {"x1"});
   op1->SetInput("b", {"b1"});
   op1->SetOutput("Out", {"out1"});
 
-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
   op2->SetType("mul");
   op2->SetInput("X", {"out1"});
   op2->SetInput("Y", {"y2"});
   op2->SetOutput("Out", {"out2"});
 
-  f::OpDescBind *op3 = block->AppendOp();
+  f::OpDesc *op3 = block->AppendOp();
   op3->SetType("rowwise_add");
   op3->SetInput("X", {"out1"});
   op3->SetInput("b", {"b3"});
   op3->SetOutput("Out", {"out3"});
 
-  auto target = f::VarDescBind("out3");
+  auto target = f::VarDesc("out3");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad =
       AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 8UL);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
 
-  f::OpDescBind *grad_op3 = block->AllOps()[4];
+  f::OpDesc *grad_op3 = block->AllOps()[4];
   ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
   ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
   ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
@@ -844,7 +844,7 @@ TEST(Backward, shared_var) {
   EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
             std::vector<std::string>({f::GradVarName("b3")}));
 
-  f::OpDescBind *grad_op4 = block->AllOps()[5];
+  f::OpDesc *grad_op4 = block->AllOps()[5];
   ASSERT_EQ(grad_op4->Type(), "mul_grad");
   ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
   ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
@@ -858,7 +858,7 @@ TEST(Backward, shared_var) {
   EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
             std::vector<std::string>({f::GradVarName("y2")}));
 
-  f::OpDescBind *sum_op = block->AllOps()[6];
+  f::OpDesc *sum_op = block->AllOps()[6];
   ASSERT_EQ(sum_op->Type(), "sum");
   ASSERT_EQ(sum_op->InputNames().size(), 1UL);
   ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
@@ -868,7 +868,7 @@ TEST(Backward, shared_var) {
   EXPECT_EQ(sum_op->Output("Out"),
             std::vector<std::string>({f::GradVarName("out1")}));
 
-  f::OpDescBind *grad_op1 = block->AllOps()[7];
+  f::OpDesc *grad_op1 = block->AllOps()[7];
   ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
   ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
   ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -895,19 +895,19 @@ TEST(Backward, shared_var) {
 }
 
 TEST(Backward, half_backward) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
   auto *op1 = block->AppendOp();
   op1->SetType("minus");
   op1->SetInput("X", {"a"});
   op1->SetInput("Y", {"b"});
   op1->SetOutput("Out", {"out"});
 
-  auto target = f::VarDescBind("out");
+  auto target = f::VarDesc("out");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"b"});
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
   EXPECT_EQ(fill_op->Type(), "fill_constant");
   auto ops = block->AllOps();
   ASSERT_EQ(3UL, ops.size());
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 6a7a07d5cf471a32822cdccf5c616d8748fd1bd7..0668b08ff7ab3c8ca4f1e989fc7af45a8ec5f63c 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -19,18 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-VarDescBind *BlockDescBind::Var(const std::string &name) {
+VarDesc *BlockDesc::Var(const std::string &name) {
   auto it = vars_.find(name);
   if (it != vars_.end()) {
     return it->second.get();
   }
   need_update_ = true;
-  auto *var = new VarDescBind(name);
+  auto *var = new VarDesc(name);
   vars_[name].reset(var);
   return var;
 }
 
-VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
+VarDesc *BlockDesc::FindVar(const std::string &name) const {
   auto it = vars_.find(name);
   if (it == vars_.end()) {
     return nullptr;
@@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
   return it->second.get();
 }
 
-bool BlockDescBind::HasVar(const std::string &name) const {
+bool BlockDesc::HasVar(const std::string &name) const {
   return vars_.find(name) != vars_.end();
 }
 
-VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
   if (name == kEmptyVarName) return nullptr;
 
   auto it = vars_.find(name);
@@ -53,53 +53,67 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
   return it->second.get();
 }
 
-VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
-    const std::string &name_bytes) {
-  VarDescBind *res = FindVarRecursive(name_bytes);
+VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
+  VarDesc *res = FindVarRecursive(name_bytes);
   if (res == nullptr) {
     res = Var(name_bytes);
   }
   return res;
 }
 
-bool BlockDescBind::HasVarRecursive(const std::string &name) const {
+bool BlockDesc::HasVarRecursive(const std::string &name) const {
   return FindVarRecursive(name) != nullptr;
 }
 
-std::vector<VarDescBind *> BlockDescBind::AllVars() const {
-  std::vector<VarDescBind *> res;
+std::vector<VarDesc *> BlockDesc::AllVars() const {
+  std::vector<VarDesc *> res;
   for (const auto &p : vars_) {
     res.push_back(p.second.get());
   }
   return res;
 }
 
-OpDescBind *BlockDescBind::AppendOp() {
+OpDesc *BlockDesc::AppendOp() {
   need_update_ = true;
-  ops_.emplace_back(new OpDescBind());
+  ops_.emplace_back(new OpDesc());
   return ops_.back().get();
 }
 
-void BlockDescBind::AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc) {
+void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
   need_update_ = true;
   ops_.emplace_back(std::move(op_desc));
 }
 
-OpDescBind *BlockDescBind::PrependOp() {
+OpDesc *BlockDesc::PrependOp() {
   need_update_ = true;
-  ops_.emplace_front(new OpDescBind());
+  ops_.emplace_front(new OpDesc());
   return ops_.front().get();
 }
 
-std::vector<OpDescBind *> BlockDescBind::AllOps() const {
-  std::vector<OpDescBind *> res;
+void BlockDesc::RemoveOp(size_t s, size_t e) {
+  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+    return;
+  }
+  need_update_ = true;
+  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
+    auto names = (*it)->InputArgumentNames();
+    for (auto n : names) {
+      // TODO(typhoonzero): delete vars if no other op use it.
+      VLOG(3) << "deleting var " << n;
+    }
+  }
+  ops_.erase(ops_.begin() + s, ops_.begin() + e);
+}
+
+std::vector<OpDesc *> BlockDesc::AllOps() const {
+  std::vector<OpDesc *> res;
   for (const auto &op : ops_) {
     res.push_back(op.get());
   }
   return res;
 }
 
-void BlockDescBind::Flush() {
+void BlockDesc::Flush() {
   for (auto &op_desc : ops_) {
     op_desc->Flush();
   }
@@ -121,43 +135,43 @@ void BlockDescBind::Flush() {
   }
 }
 
-BlockDescBind *BlockDescBind::ParentBlock() const {
+BlockDesc *BlockDesc::ParentBlock() const {
   if (this->desc_->parent_idx() == kNoneBlockIndex) {
     return nullptr;
   }
   return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx()));
 }
 
-BlockDesc *BlockDescBind::Proto() {
+proto::BlockDesc *BlockDesc::Proto() {
   Flush();
   return desc_;
 }
 
-BlockDescBind::BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
+BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
     : prog_(prog), desc_(desc), need_update_(false) {
-  for (const VarDesc &var_desc : desc_->vars()) {
-    vars_[var_desc.name()].reset(new VarDescBind(var_desc));
+  for (const proto::VarDesc &var_desc : desc_->vars()) {
+    vars_[var_desc.name()].reset(new VarDesc(var_desc));
   }
-  for (const OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDescBind(op_desc, prog));
+  for (const proto::OpDesc &op_desc : desc_->ops()) {
+    ops_.emplace_back(new OpDesc(op_desc, prog));
   }
 }
 
-BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
-                             ProgramDescBind *prog)
+BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
+                     ProgramDesc *prog)
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDescBind(*op));
+    ops_.emplace_back(new OpDesc(*op));
   }
 
   for (auto &it : other.vars_) {
-    auto *var = new VarDescBind(*it.second);
+    auto *var = new VarDesc(*it.second);
     vars_[it.first].reset(var);
   }
 }
 
-void BlockDescBind::ClearPBOps() {
+void BlockDesc::ClearPBOps() {
   auto ops = this->desc_->mutable_ops();
   while (!ops->empty()) {
     // we do not own the OpDesc, so release the ownership.
@@ -165,7 +179,7 @@ void BlockDescBind::ClearPBOps() {
   }
 }
 
-void BlockDescBind::ClearPBVars() {
+void BlockDesc::ClearPBVars() {
   auto vars = this->desc_->mutable_vars();
   while (!vars->empty()) {
     // we do not own the VarDesc, so release the ownership.
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 8e967e5378eb47a7869efb59cc96a271f1cbb9a1..6c8c81b332d99e52db41018e117aa837be6745bc 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -28,20 +28,19 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class ProgramDescBind;
+class ProgramDesc;
 
 // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
 // read/write speed. Only when we want the protobuf message, the local changes
 // will be synchronized (by `Sync` method).
 
-class BlockDescBind {
+class BlockDesc {
  public:
-  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc);
+  BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc);
 
-  BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
-                ProgramDescBind *prog);
+  BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
 
-  ~BlockDescBind() {
+  ~BlockDesc() {
     this->ClearPBVars();
     this->ClearPBOps();
   }
@@ -50,15 +49,15 @@ class BlockDescBind {
 
   int32_t Parent() const { return desc_->parent_idx(); }
 
-  VarDescBind *Var(const std::string &name_bytes);
+  VarDesc *Var(const std::string &name_bytes);
 
-  VarDescBind *FindVar(const std::string &name_bytes) const;
+  VarDesc *FindVar(const std::string &name_bytes) const;
 
   bool HasVar(const std::string &var_name) const;
 
-  VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+  VarDesc *FindVarRecursive(const std::string &name_bytes) const;
 
-  VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+  VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes);
 
   bool HasVarRecursive(const std::string &var_name) const;
 
@@ -70,41 +69,43 @@ class BlockDescBind {
     return var_names;
   }
 
-  std::vector<VarDescBind *> AllVars() const;
+  std::vector<VarDesc *> AllVars() const;
 
-  BlockDescBind *ParentBlock() const;
+  BlockDesc *ParentBlock() const;
 
-  OpDescBind *AppendOp();
+  OpDesc *AppendOp();
 
-  void AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc);
+  void AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
 
-  OpDescBind *PrependOp();
+  OpDesc *PrependOp();
 
-  std::vector<OpDescBind *> AllOps() const;
+  void RemoveOp(size_t s, size_t e);
+
+  std::vector<OpDesc *> AllOps() const;
 
   size_t OpSize() const { return ops_.size(); }
 
-  OpDescBind *Op(int idx) { return ops_.at(idx).get(); }
+  OpDesc *Op(int idx) { return ops_.at(idx).get(); }
 
   void Flush();
 
-  BlockDesc *Proto();
+  proto::BlockDesc *Proto();
 
-  ProgramDescBind *Program() { return this->prog_; }
+  ProgramDesc *Program() { return this->prog_; }
 
  private:
   void ClearPBOps();
   void ClearPBVars();
 
  private:
-  ProgramDescBind *prog_;  // not_own
-  BlockDesc *desc_;        // not_own
+  ProgramDesc *prog_;       // not_own
+  proto::BlockDesc *desc_;  // not_own
   bool need_update_;
 
-  std::deque<std::unique_ptr<OpDescBind>> ops_;
-  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+  std::deque<std::unique_ptr<OpDesc>> ops_;
+  std::unordered_map<std::string, std::unique_ptr<VarDesc>> vars_;
 
-  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
+  DISABLE_COPY_AND_ASSIGN(BlockDesc);
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a8669c3a41fceaad26878a79eabfd0affce86fd
--- /dev/null
+++ b/paddle/framework/data_layout.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/platform/enforce.h"
+
+#include <iostream>
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+enum class DataLayout {
+  kNHWC = 0,
+  kNCHW = 1,
+  kAnyLayout = 2,
+};
+
+inline DataLayout StringToDataLayout(const std::string& str) {
+  if (str == "NHWC" || str == "nhwc") {
+    return DataLayout::kNHWC;
+  } else if (str == "NCHW" || str == "nchw") {
+    return DataLayout::kNCHW;
+  } else {
+    PADDLE_THROW("Unknown storage order string: %s", str);
+  }
+}
+
+inline std::string DataLayoutToString(const DataLayout& data_layout) {
+  switch (data_layout) {
+    case DataLayout::kNHWC:
+      return "NHWC";
+    case DataLayout::kNCHW:
+      return "NCHW";
+    case DataLayout::kAnyLayout:
+      return "ANY_LAYOUT";
+    default:
+      PADDLE_THROW("unknown DataLayou %d", data_layout);
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out, DataLayout l) {
+  out << DataLayoutToString(l);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..376268888e70b0a70060c81384f79f8bf5d6dcc5
--- /dev/null
+++ b/paddle/framework/data_transform.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_transform.h"
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+DataTransformFnMap& DataTransformFnMap::Instance() {
+  static DataTransformFnMap data_transform_map;
+  return data_transform_map;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..2191dd3783d5ed7bb59b96c70d38a72bb0b2fee7
--- /dev/null
+++ b/paddle/framework/data_transform.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/variable.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+
+using DataTransformFn =
+    std::function<void(const std::vector<platform::DeviceContext*> ctx,
+                       const Variable& in, Variable* out)>;
+using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+
+struct KernelTypePairHash {
+  static void HashCombine(const OpKernelType& t, std::size_t* seed) {
+    OpKernelType::Hash kernel_type_hasher;
+    (*seed) ^= kernel_type_hasher(t) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
+  }
+
+  size_t operator()(const KernelTypePair& kernel_pair) const {
+    std::size_t seed = 0;
+    HashCombine(kernel_pair.first, &seed);
+    HashCombine(kernel_pair.second, &seed);
+    return seed;
+  }
+};
+
+using DataTransformMap =
+    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
+
+class DataTransformFnMap {
+ public:
+  static DataTransformFnMap& Instance();
+
+  bool Has(const KernelTypePair& key_pair) const {
+    return map_.find(key_pair) != map_.end();
+  }
+
+  void Insert(const OpKernelType& left, const OpKernelType& right,
+              const DataTransformFn& data_tranform_fn) {
+    Insert(std::make_pair(left, right), data_tranform_fn);
+  }
+
+  void Insert(const KernelTypePair& kernel_type_pair,
+              const DataTransformFn& data_tranform_fn) {
+    PADDLE_ENFORCE(!Has(kernel_type_pair),
+                   "KernelTypePair %s has been registered", "");
+    map_.insert({kernel_type_pair, data_tranform_fn});
+  }
+
+  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
+    auto data_transformer = GetNullable(key_pair);
+    PADDLE_ENFORCE_NOT_NULL(data_transformer,
+                            "DataTransformFn should not be NULL");
+    return *data_transformer;
+  }
+
+  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
+    auto it = map_.find(key_pair);
+    if (it == map_.end()) {
+      return nullptr;
+    } else {
+      return &(it->second);
+    }
+  }
+
+  const DataTransformMap& Map() const { return map_; }
+
+ private:
+  DataTransformFnMap() = default;
+  DataTransformMap map_;
+  DISABLE_COPY_AND_ASSIGN(DataTransformFnMap);
+};
+
+// generate unique name with __LINE__
+// refs https://stackoverflow.com/questions/1597007
+#define TOKENPASTE(x, y) x##y
+#define TOKENPASTE2(x, y) TOKENPASTE(x, y)
+#define REGISTER_DATA_TRANSFORM_FN(from, to, fn)                              \
+  static int TOKENPASTE2(fn_, __LINE__)() {                                   \
+    ::paddle::framework::DataTransformFnMap::Instance().Insert(from, to, fn); \
+    return 0;                                                                 \
+  }                                                                           \
+  static int TOKENPASTE2(var_, __LINE__) __attribute__((unused)) =            \
+      TOKENPASTE2(fn_, __LINE__)()
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e2141ecd2ebe35402a8a04613702a2f79f6a179
--- /dev/null
+++ b/paddle/framework/data_transform_test.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <array>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "paddle/framework/data_transform.h"
+
+namespace paddle {
+namespace framework {
+using namespace platform;
+
+/**
+ * @brief cross validation of different kernel type transform
+ *  We use four bit map represent different combination.
+ *  If the field has multiple possible value, only choose two of them.
+ *  For DataType, only test the FP32(float), FP64(double).
+ *  e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
+ *       1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
+ */
+
+std::array<proto::DataType, 2> kDataType = {
+    {proto::DataType::FP32, proto::DataType::FP64}};
+
+std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+
+std::array<DataLayout, 2> kDataLayout = {
+    {DataLayout::kNHWC, DataLayout::kNCHW}};
+
+std::array<LibraryType, 2> kLibraryType = {
+    {LibraryType::kPlain, LibraryType::kMKLDNN}};
+
+OpKernelType GenFromBit(const std::vector<bool> bits) {
+  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
+                      kLibraryType[bits[3]]);
+}
+
+int test_value = 0;
+
+auto kernel0 = GenFromBit({0, 0, 0, 0});
+auto kernel1 = GenFromBit({0, 0, 0, 1});
+auto kernel2 = GenFromBit({0, 0, 1, 0});
+auto kernel3 = GenFromBit({0, 0, 1, 1});
+
+void TransDataType_t(std::vector<platform::DeviceContext*> ctx,
+                     const Variable& in, Variable* out) {
+  test_value++;
+}
+
+void TransDataLayout_t(std::vector<platform::DeviceContext*> ctx,
+                       const Variable& in, Variable* out) {
+  test_value--;
+}
+
+void TransLibraryType_t(std::vector<platform::DeviceContext*> ctx,
+                        const Variable& in, Variable* out) {
+  test_value += 2;
+}
+
+}  // namespace framework
+}  // namespace paddle
+
+namespace frw = paddle::framework;
+
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);
+
+TEST(DataTransform, Register) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  std::vector<DeviceContext*> ctx;
+  paddle::framework::Variable in;
+  paddle::framework::Variable out;
+
+  instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
+  ASSERT_EQ(test_value, 1);
+
+  instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
+  ASSERT_EQ(test_value, 0);
+
+  instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
+  ASSERT_EQ(test_value, 2);
+}
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c54d2d4ddf09c445fb25c1fbe8a7498f233d8212..6a372ac32e48131eed28e2d42125feb5b92a11c7 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <typeindex>
@@ -20,7 +20,8 @@
 namespace paddle {
 namespace framework {
 
-inline DataType ToDataType(std::type_index type) {
+inline proto::DataType ToDataType(std::type_index type) {
+  using namespace paddle::framework::proto;
   if (typeid(float).hash_code() == type.hash_code()) {
     return DataType::FP32;
   } else if (typeid(double).hash_code() == type.hash_code()) {
@@ -36,7 +37,8 @@ inline DataType ToDataType(std::type_index type) {
   }
 }
 
-inline std::type_index ToTypeIndex(DataType type) {
+inline std::type_index ToTypeIndex(proto::DataType type) {
+  using namespace paddle::framework::proto;
   switch (type) {
     case DataType::FP32:
       return typeid(float);
@@ -54,7 +56,8 @@ inline std::type_index ToTypeIndex(DataType type) {
 }
 
 template <typename Visitor>
-inline void VisitDataType(DataType type, Visitor visitor) {
+inline void VisitDataType(proto::DataType type, Visitor visitor) {
+  using namespace paddle::framework::proto;
   switch (type) {
     case DataType::FP32:
       visitor.template operator()<float>();
diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
index 756232b1b56a49d2c91cc2cac950ca508c54fb3f..bc259d1f603fb34ac8546c388669d8c5c1250bd1 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -1,3 +1,16 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <sstream>
 #include <vector>
 
diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
index f91e0e03410c95f84a65f02beed38b7bbfdcaa86..6d50e820b2b625f932768d2ca671d999071f1ca6 100644
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -90,7 +90,7 @@ struct OpInfoFiller<T, kOperator> {
 template <typename T>
 struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
-    info->proto_ = new OpProto;
+    info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
     auto maker = T(info->proto_, info->checker_);
     maker.Validate();
@@ -106,10 +106,10 @@ template <typename T>
 struct OpInfoFiller<T, kGradOpDescMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
     info->grad_op_maker_ = [](
-        const OpDescBind& fwd_op,
+        const OpDesc& fwd_op,
         const std::unordered_set<std::string>& no_grad_set,
         std::unordered_map<std::string, std::string>* grad_to_var,
-        const std::vector<BlockDescBind*>& grad_block) {
+        const std::vector<BlockDesc*>& grad_block) {
       T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
       return maker();
     };
@@ -119,7 +119,7 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
 template <typename T>
 struct OpInfoFiller<T, kVarTypeInference> {
   void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) {
+    info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
       T inference;
       inference(fwd_op, block);
     };
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 83aa927c293676c3800ed945c175e4f3dc5629d6..31749743a58835ee2209d5a448f28b011cb3f7af 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,48 +33,22 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
 
-Executor::Executor(const std::vector<platform::Place>& places) : own_(true) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  device_contexts_.resize(places.size());
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i]));
-    } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-      device_contexts_[i] = new platform::CUDADeviceContext(
-          boost::get<platform::GPUPlace>(places[i]));
-#else
-      PADDLE_THROW(
-          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
-#endif
-    }
-  }
-}
-
-Executor::~Executor() {
-  if (own_) {
-    for (auto& device_context : device_contexts_) {
-      delete device_context;
-    }
-  }
-}
+Executor::Executor(const platform::Place& place) : place_(place) {}
 
-static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
-  if (var_type == VarDesc::LOD_TENSOR) {
+static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
+  if (var_type == proto::VarDesc::LOD_TENSOR) {
     var->GetMutable<LoDTensor>();
-  } else if (var_type == VarDesc::SELECTED_ROWS) {
+  } else if (var_type == proto::VarDesc::SELECTED_ROWS) {
     var->GetMutable<SelectedRows>();
-  } else if (var_type == VarDesc::FEED_MINIBATCH) {
+  } else if (var_type == proto::VarDesc::FEED_MINIBATCH) {
     var->GetMutable<FeedFetchList>();
-  } else if (var_type == VarDesc::FETCH_LIST) {
+  } else if (var_type == proto::VarDesc::FETCH_LIST) {
     var->GetMutable<FeedFetchList>();
-  } else if (var_type == VarDesc::STEP_SCOPES) {
+  } else if (var_type == proto::VarDesc::STEP_SCOPES) {
     var->GetMutable<std::vector<framework::Scope>>();
-  } else if (var_type == VarDesc::LOD_RANK_TABLE) {
+  } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) {
     var->GetMutable<LoDRankTable>();
-  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+  } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
     var->GetMutable<LoDTensorArray>();
   } else {
     PADDLE_THROW(
@@ -84,56 +58,54 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
   }
 }
 
-void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope) {
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
+                   bool create_local_scope, bool create_vars) {
   // TODO(tonyyang-svail):
   //    - only runs on the first device (i.e. no interdevice communication)
   //    - will change to use multiple blocks for RNN op and Cond Op
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
   auto& block = pdesc.Block(block_id);
-  auto& device = device_contexts_[0];
 
   Scope* local_scope = scope;
-  if (create_local_scope) {
-    local_scope = &scope->NewScope();
-    for (auto& var : block.AllVars()) {
-      if (var->Name() == framework::kEmptyVarName) {
-        continue;
+  if (create_vars) {
+    if (create_local_scope) {
+      local_scope = &scope->NewScope();
+      for (auto& var : block.AllVars()) {
+        if (var->Name() == framework::kEmptyVarName) {
+          continue;
+        }
+
+        if (var->Persistable()) {
+          auto* ptr = scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " global, which pointer is " << ptr;
+        } else {
+          auto* ptr = local_scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " locally, which pointer is " << ptr;
+        }
       }
-
-      if (var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
-      } else {
+    } else {
+      for (auto& var : block.AllVars()) {
         auto* ptr = local_scope->Var(var->Name());
         CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+                << ptr;
       }
-    }
-  } else {
-    for (auto& var : block.AllVars()) {
-      auto* ptr = local_scope->Var(var->Name());
-      CreateTensor(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
-    }
-  }
+    }  // if (create_local_scope)
+  }    // if (create_vars)
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
     VLOG(3) << op->DebugString();
-    op->Run(*local_scope, *device);
+    op->Run(*local_scope, place_);
   }
-  if (create_local_scope) {
+  if (create_vars && create_local_scope) {
     scope->DeleteScope(local_scope);
   }
 }
 
-Executor::Executor(const platform::DeviceContext& device)
-    : device_contexts_({&device}), own_(false) {}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index b745f4f6474ef688774f4c833a3958942e9aa8cb..d869e18901b82959a40cc296aa0844c20ea63ac1 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -18,15 +18,18 @@ limitations under the License. */
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
 
 class Executor {
  public:
-  explicit Executor(const std::vector<platform::Place>& places);
-  explicit Executor(const platform::DeviceContext& devices);
-  ~Executor();
+  // TODO(dzhwinter) : Do not rely on this function, it will be removed
+  explicit Executor(const platform::DeviceContext& device)
+      : Executor(device.GetPlace()) {}
+
+  explicit Executor(const platform::Place& place);
 
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
@@ -35,11 +38,11 @@ class Executor {
    *  ProgramDesc
    *  Scope
    */
-  void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true);
+  void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
+           bool create_vars = true);
 
  private:
-  std::vector<const platform::DeviceContext*> device_contexts_;
-  bool own_;
+  const platform::Place place_;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
index bc4ae440fc708f696c18bb9d5ab3ba7dd59e21ab..9bc4a90c44828ecb7458d524f59609f01848cc5c 100644
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <vector>
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index f1fc4529e15502927560eefd74110f6ca7eab4a9..4f2746e4b86ee5fe095897ff6ef9d3f6473e8a14 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -14,7 +14,7 @@ limitations under the License. */
 
 syntax = "proto2";
 option optimize_for = LITE_RUNTIME;
-package paddle.framework;
+package paddle.framework.proto;
 
 enum AttrType {
   INT = 0;
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
index 998186e33915a11f2864eb5387d19ed1bfbab51c..2de5242831835b47893a5825e5532500ad5ec3f9 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <string>
@@ -22,21 +22,27 @@
 namespace paddle {
 namespace framework {
 
+/*
+  This functor class is responsible for creating the gradient ops for the given
+  operator fwd_op. After it is called (through operator()), the pairs of
+  (gradient variable, corresponding input variable of fwd_op) will be added to
+  grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
+  gradient varialbe will be ignored or kEmptyVarName depending on the template
+  argument DropEmptyIG in the derived classes.
+ */
 class GradOpDescMakerBase {
  public:
   explicit GradOpDescMakerBase(
-      const OpDescBind& fwd_op,
-      const std::unordered_set<std::string>& no_grad_set,
+      const OpDesc& fwd_op, const std::unordered_set<std::string>& no_grad_set,
       std::unordered_map<std::string, std::string>* grad_to_var,
-      const std::vector<BlockDescBind*>& grad_block =
-          std::vector<BlockDescBind*>())
+      const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>())
       : fwd_op_(fwd_op),
         no_grad_set_(no_grad_set),
         grad_to_var_(grad_to_var),
         grad_block_(grad_block) {}
 
   virtual ~GradOpDescMakerBase() = default;
-  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+  virtual std::vector<std::unique_ptr<OpDesc>> operator()() const = 0;
 
  protected:
   std::vector<std::string> InputGrad(const std::string& name,
@@ -58,6 +64,16 @@ class GradOpDescMakerBase {
     if (!drop_empty_grad) {
       return ret_val;
     }
+    PADDLE_ENFORCE_LE(var_names.size(), 1UL,
+                      "BUG from operator developer:"
+                      " for input argument with a list of variables, "
+                      " drop_empty_grad is not allowed because it makes"
+                      " the correspondence bewteen a variable and its gradient"
+                      " ambiguous. Use REGISTER_OP_EX to register the op"
+                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " Op type %s",
+                      fwd_op_.Type());
+
     std::vector<std::string> dropped_ret_val;
     dropped_ret_val.reserve(ret_val.size());
     std::copy_if(ret_val.begin(), ret_val.end(),
@@ -105,26 +121,26 @@ class GradOpDescMakerBase {
   std::string ForwardOpType() const { return this->fwd_op_.Type(); }
 
  private:
-  const OpDescBind& fwd_op_;
+  const OpDesc& fwd_op_;
   const std::unordered_set<std::string>& no_grad_set_;
   std::unordered_map<std::string, std::string>* grad_to_var_;
 
  protected:
-  std::vector<BlockDescBind*> grad_block_;
+  std::vector<BlockDesc*> grad_block_;
 };
 
 class SingleGradOpDescMaker : public GradOpDescMakerBase {
  public:
   using GradOpDescMakerBase::GradOpDescMakerBase;
 
-  std::vector<std::unique_ptr<OpDescBind>> operator()() const {
-    std::vector<std::unique_ptr<OpDescBind>> retv;
+  std::vector<std::unique_ptr<OpDesc>> operator()() const {
+    std::vector<std::unique_ptr<OpDesc>> retv;
     retv.emplace_back(this->Apply());
     return retv;
   }
 
  protected:
-  virtual std::unique_ptr<OpDescBind> Apply() const = 0;
+  virtual std::unique_ptr<OpDesc> Apply() const = 0;
 };
 
 template <bool DropEmptyIG = true>
@@ -133,8 +149,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
   using SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  virtual std::unique_ptr<OpDescBind> Apply() const {
-    auto* grad = new OpDescBind();
+  virtual std::unique_ptr<OpDesc> Apply() const {
+    auto* grad = new OpDesc();
     grad->SetType(this->GradOpType());
 
     for (auto& input_param : this->InputNames()) {
@@ -150,7 +166,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
 
     grad->SetAttrMap(this->Attrs());
 
-    return std::unique_ptr<OpDescBind>(grad);
+    return std::unique_ptr<OpDesc>(grad);
   }
 
   virtual std::string GradOpType() const {
@@ -161,7 +177,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
 class EmptyGradOpMaker : public GradOpDescMakerBase {
  public:
   using GradOpDescMakerBase::GradOpDescMakerBase;
-  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
     return {};
   }
 };
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
new file mode 100644
index 0000000000000000000000000000000000000000..682cff168d4d31e0565fc987604f97a671566fbd
--- /dev/null
+++ b/paddle/framework/init.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <algorithm>
+#include <string>
+
+#include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/string/piece.h"
+
+namespace paddle {
+namespace framework {
+
+std::once_flag gflags_init_flag;
+
+// TODO(qijun) move init gflags to init.cc
+void InitGflags(std::vector<std::string> &argv) {
+  std::call_once(gflags_init_flag, [&]() {
+    int argc = argv.size();
+    char **arr = new char *[argv.size()];
+    std::string line;
+    for (size_t i = 0; i < argv.size(); i++) {
+      arr[i] = &argv[i][0];
+      line += argv[i];
+      line += ' ';
+    }
+    google::ParseCommandLineFlags(&argc, &arr, true);
+    VLOG(1) << "Init commandline: " << line;
+  });
+}
+
+bool InitDevices(const std::vector<std::string> &devices) {
+  // device format
+  // CPU
+  // GPU:1
+  // TODO(dzhwinter) : add device format annotation for users.
+  std::vector<platform::Place> places;
+  for (auto &device : devices) {
+    auto p = string::Piece(device);
+    if (string::HasPrefix(p, "CPU")) {
+      places.emplace_back(platform::CPUPlace());
+    } else if (string::HasPrefix(p, "GPU")) {
+#ifdef PADDLE_WITH_CUDA
+      auto pos = string::RFind(p, ':', string::Piece::npos);
+      auto number = device.substr(pos + 1);
+      places.emplace_back(platform::CUDAPlace(std::stoi(number)));
+#else
+      LOG(WARNING)
+          << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+#endif
+    } else {
+      return false;
+    }
+  }
+
+  if (std::find_if(places.begin(), places.end(),
+                   [&](const platform::Place &place) {
+                     return platform::is_cpu_place(place);
+                   }) == places.end()) {
+    places.emplace_back(platform::CPUPlace());
+    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
+  }
+  platform::DeviceContextPool::Init(places);
+  return true;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init.h b/paddle/framework/init.h
new file mode 100644
index 0000000000000000000000000000000000000000..33907f9eb00fb3469b53dcf8151557cc7a2d3791
--- /dev/null
+++ b/paddle/framework/init.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <mutex>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+namespace paddle {
+namespace framework {
+
+void InitGflags(std::vector<std::string> &argv);
+
+bool InitDevices(const std::vector<std::string> &devices);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0788051d4855a175d2d7ea1f1a0805c776c462b
--- /dev/null
+++ b/paddle/framework/init_test.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+
+TEST(Init, InitDevices) {
+  using paddle::framework::InitDevices;
+  std::vector<std::string> ds1 = {"CPU"};
+  ASSERT_EQ(InitDevices(ds1), true);
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds2), true);
+
+  // test re-init
+  std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds3), true);
+#endif
+}
diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..7707799cae8c4edc304cd81725270a85f01fd28d
--- /dev/null
+++ b/paddle/framework/library_type.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// For more details about the design of LibraryType, Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
+
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+
+inline std::string LibraryTypeToString(const LibraryType& library_type) {
+  switch (library_type) {
+    case LibraryType::kPlain:
+      return "PLAIN";
+    case LibraryType::kMKLDNN:
+      return "MKLDNN";
+    case LibraryType::kCUDNN:
+      return "CUDNN";
+    default:
+      PADDLE_THROW("unknown LibraryType %d", static_cast<int>(library_type));
+  }
+}
+
+inline LibraryType StringToLibraryType(const char* ctype) {
+  std::string s(ctype);
+  if (s == std::string("PLAIN")) {
+    return LibraryType::kPlain;
+  } else if (s == std::string("MKLDNN")) {
+    return LibraryType::kMKLDNN;
+  } else if (s == std::string("CUDNN")) {
+    return LibraryType::kCUDNN;
+    // To be compatible with register macro.
+    // CPU, CUDA, PLAIN are same library type.
+  } else if (s == std::string("CPU")) {
+    return LibraryType::kPlain;
+  } else if (s == std::string("CUDA")) {
+    return LibraryType::kPlain;
+  } else {
+    PADDLE_THROW("Unknown LibraryType %s", s.c_str());
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
+  out << LibraryTypeToString(l);
+  return out;
+}
+
+}  // namespace
+}  // framework
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc
index 1c2fba70c8ab0827ba6d1563f08cd0820650822e..704bce2a0eb60b974efd41a4edda0af2933da825 100644
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/lod_rank_table.h"
 
@@ -46,4 +46,13 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
 }
 
 }  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table) {
+  out << "NumOfSequence " << table.items().size() << "\n";
+  for (auto& each_item : table.items()) {
+    out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n";
+  }
+  return out;
+}
 }  // namespace paddle
diff --git a/paddle/framework/lod_rank_table.h b/paddle/framework/lod_rank_table.h
index 9faa3a4d7bdc55ab7b24e31f5e5434dacc0a4b36..df188709e91871ded0258fa5703ee16a5664f057 100644
--- a/paddle/framework/lod_rank_table.h
+++ b/paddle/framework/lod_rank_table.h
@@ -1,18 +1,19 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
+#include <iosfwd>
 #include "paddle/framework/lod_tensor.h"
 
 namespace paddle {
@@ -52,4 +53,8 @@ class LoDRankTable {
 };
 
 }  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table);
+
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index fdf6de4babff3bb3c253aaf516636882237e6faf..f8a3be9a82bdbaf82550634d36122eb7bbe85e54 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/data_type.h"
@@ -197,7 +197,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
   {  // the 2nd field, tensor description
      // int32_t  size
      // void*    protobuf message
-    framework::TensorDesc desc;
+    proto::TensorDesc desc;
     desc.set_data_type(framework::ToDataType(tensor.type()));
     auto dims = framework::vectorize(tensor.dims());
     auto *pb_dims = desc.mutable_dims();
@@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
       while (size != 0) {
         size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
         memory::Copy(cpu, buf.get(),
-                     boost::get<platform::GPUPlace>(tensor.place()),
+                     boost::get<platform::CUDAPlace>(tensor.place()),
                      reinterpret_cast<const void *>(data), size_to_write,
                      gpu_dev_ctx.stream());
         gpu_dev_ctx.Wait();
@@ -262,7 +262,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
   uint32_t version;
   is.read(reinterpret_cast<char *>(&version), sizeof(version));
   PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  framework::TensorDesc desc;
+  proto::TensorDesc desc;
   {  // int32_t size
      // proto buffer
     int32_t size;
@@ -281,16 +281,16 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
     void *buf;
     platform::Place cpu = platform::CPUPlace();
     switch (desc.data_type()) {
-      case framework::FP32:
+      case proto::FP32:
         buf = tensor->mutable_data<float>(cpu);
         break;
-      case framework::FP64:
+      case proto::FP64:
         buf = tensor->mutable_data<double>(cpu);
         break;
-      case framework::INT32:
+      case proto::INT32:
         buf = tensor->mutable_data<int>(cpu);
         break;
-      case framework::INT64:
+      case proto::INT64:
         buf = tensor->mutable_data<int64_t>(cpu);
         break;
       default:
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 9411c96aea4c10ebf921cc3e3b442769c8acbefa..147db3ab0877662d9e47ae7ee6df05638b5fcbd1 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -184,6 +184,18 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   return tensor;
 }
 
+// Get the absolute offset of a lod[start_level][start_idx:end_idx] and
+// relative length of details for every levels(i.e., [start_level: ]).
+//
+// For example,
+//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
+//   start_level = 0
+//   start_idx = 1
+//   end_idx = 3
+//
+// Returns:
+//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
+//  pair<size_t, size_t> = {11, 24}
 std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
     const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
 
diff --git a/paddle/framework/lod_tensor_array.h b/paddle/framework/lod_tensor_array.h
index 13f0608d24be97d8bba149b74f1a4deb57deeb48..4a8e7f4fa540b1c2f19a6e3ec236a0dd5c0daf0b 100644
--- a/paddle/framework/lod_tensor_array.h
+++ b/paddle/framework/lod_tensor_array.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <vector>
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 5b90fbfca7f6bec4f2c862d0ff18dfd7cf39e181..e8508ad2658ae850e4c98aa798b5db6d007e67d0 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) {
 
 TEST(LoDTensor, LoDInGPU) {
   paddle::framework::LoDTensor lod_tensor;
-  paddle::platform::GPUPlace place(0);
+  paddle::platform::CUDAPlace place(0);
 
   paddle::framework::LoD src_lod;
   src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 7ba1e3e4e3270f4cd88e41e245f24c3cfc8aaab7..781bbb4c19f1c610df485c3061ca8b510e727019 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -25,12 +25,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class OpDescBind;
-class BlockDescBind;
+class OpDesc;
+class BlockDesc;
 class CompileTimeInferShapeContext : public InferShapeContext {
  public:
-  CompileTimeInferShapeContext(const OpDescBind &op,
-                               const BlockDescBind &block);
+  CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block);
 
   bool HasInput(const std::string &name) const override;
 
@@ -58,11 +57,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     PADDLE_ENFORCE_LT(j, Outputs(out).size());
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
-    if (in_var->GetType() != VarDesc::LOD_TENSOR) {
+    if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) {
       VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
+    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR,
                       "The %d-th output of Output(%s) must be LoDTensor.", j,
                       out);
     out_var->SetLoDLevel(in_var->GetLodLevel());
@@ -70,19 +69,18 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   bool IsRuntime() const override;
 
  protected:
-  VarDesc::VarType GetVarType(const std::string &name) const override;
+  proto::VarDesc::VarType GetVarType(const std::string &name) const override;
 
   DDim GetDim(const std::string &name) const override;
 
   void SetDim(const std::string &name, const DDim &dim) override;
 
-  const OpDescBind &op_;
-  const BlockDescBind &block_;
+  const OpDesc &op_;
+  const BlockDesc &block_;
 };
 
-OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
-                       const VariableNameMap &outputs,
-                       const AttributeMap &attrs) {
+OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
+               const VariableNameMap &outputs, const AttributeMap &attrs) {
   desc_.set_type(type);
   inputs_ = inputs;
   outputs_ = outputs;
@@ -90,12 +88,20 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
   need_update_ = true;
 }
 
-OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
+void OpDesc::CopyFrom(const OpDesc &op_desc) {
+  desc_.set_type(op_desc.Type());
+  inputs_ = op_desc.inputs_;
+  outputs_ = op_desc.outputs_;
+  attrs_ = op_desc.attrs_;
+  need_update_ = true;
+}
+
+OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
     : desc_(desc), need_update_(false) {
   // restore inputs_
   int input_size = desc_.inputs_size();
   for (int i = 0; i < input_size; ++i) {
-    const OpDesc::Var &var = desc_.inputs(i);
+    const proto::OpDesc::Var &var = desc_.inputs(i);
     std::vector<std::string> &args = inputs_[var.parameter()];
     int argu_size = var.arguments_size();
     args.reserve(argu_size);
@@ -106,7 +112,7 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
   // restore outputs_
   int output_size = desc_.outputs_size();
   for (int i = 0; i < output_size; ++i) {
-    const OpDesc::Var &var = desc_.outputs(i);
+    const proto::OpDesc::Var &var = desc_.outputs(i);
     std::vector<std::string> &args = outputs_[var.parameter()];
     int argu_size = var.arguments_size();
     args.reserve(argu_size);
@@ -115,9 +121,9 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
     }
   }
   // restore attrs_
-  for (const OpDesc::Attr &attr : desc_.attrs()) {
+  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
     std::string attr_name = attr.name();
-    if (attr.type() != AttrType::BLOCK) {
+    if (attr.type() != proto::AttrType::BLOCK) {
       attrs_[attr_name] = GetAttrValue(attr);
     } else {
       auto bid = attr.block_idx();
@@ -126,20 +132,19 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
   }
 }
 
-OpDesc *OpDescBind::Proto() {
+proto::OpDesc *OpDesc::Proto() {
   Flush();
   return &desc_;
 }
 
-const std::vector<std::string> &OpDescBind::Input(
-    const std::string &name) const {
+const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
   auto it = inputs_.find(name);
   PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
                  Type());
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::InputArgumentNames() const {
+std::vector<std::string> OpDesc::InputArgumentNames() const {
   std::vector<std::string> retv;
   for (auto &ipt : this->inputs_) {
     retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
@@ -147,21 +152,20 @@ std::vector<std::string> OpDescBind::InputArgumentNames() const {
   return retv;
 }
 
-void OpDescBind::SetInput(const std::string &param_name,
-                          const std::vector<std::string> &args) {
+void OpDesc::SetInput(const std::string &param_name,
+                      const std::vector<std::string> &args) {
   need_update_ = true;
   inputs_[param_name] = args;
 }
 
-const std::vector<std::string> &OpDescBind::Output(
-    const std::string &name) const {
+const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
   auto it = outputs_.find(name);
   PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
                  name, Type());
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::OutputArgumentNames() const {
+std::vector<std::string> OpDesc::OutputArgumentNames() const {
   std::vector<std::string> retv;
   for (auto &ipt : this->outputs_) {
     retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
@@ -169,19 +173,19 @@ std::vector<std::string> OpDescBind::OutputArgumentNames() const {
   return retv;
 }
 
-void OpDescBind::SetOutput(const std::string &param_name,
-                           const std::vector<std::string> &args) {
+void OpDesc::SetOutput(const std::string &param_name,
+                       const std::vector<std::string> &args) {
   need_update_ = true;
   this->outputs_[param_name] = args;
 }
 
-AttrType OpDescBind::GetAttrType(const std::string &name) const {
+proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return static_cast<AttrType>(it->second.which() - 1);
+  return static_cast<proto::AttrType>(it->second.which() - 1);
 }
 
-std::vector<std::string> OpDescBind::AttrNames() const {
+std::vector<std::string> OpDesc::AttrNames() const {
   std::vector<std::string> retv;
   retv.reserve(attrs_.size());
   for (auto &attr : attrs_) {
@@ -190,41 +194,39 @@ std::vector<std::string> OpDescBind::AttrNames() const {
   return retv;
 }
 
-void OpDescBind::SetAttr(const std::string &name, const Attribute &v) {
+void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
   this->attrs_[name] = v;
   need_update_ = true;
 }
 
-void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
+void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) {
   this->attrs_[name] = &block;
   need_update_ = true;
 }
 
-void OpDescBind::SetAttrMap(
+void OpDesc::SetAttrMap(
     const std::unordered_map<std::string, Attribute> &attr_map) {
   attrs_ = attr_map;
   need_update_ = true;
 }
 
-Attribute OpDescBind::GetAttr(const std::string &name) const {
+Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
   return it->second;
 }
 
-int OpDescBind::GetBlockAttr(const std::string &name) const {
+int OpDesc::GetBlockAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return boost::get<BlockDescBind *>(it->second)->ID();
+  return boost::get<BlockDesc *>(it->second)->ID();
 }
 
-const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
-    const {
+const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
   return attrs_;
 }
 
-void OpDescBind::Rename(const std::string &old_name,
-                        const std::string &new_name) {
+void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
   for (auto &input : inputs_) {
     std::replace(input.second.begin(), input.second.end(), old_name, new_name);
   }
@@ -235,8 +237,8 @@ void OpDescBind::Rename(const std::string &old_name,
   need_update_ = true;
 }
 
-void OpDescBind::RenameOutput(const std::string &old_name,
-                              const std::string &new_name) {
+void OpDesc::RenameOutput(const std::string &old_name,
+                          const std::string &new_name) {
   for (auto &output : outputs_) {
     std::replace(output.second.begin(), output.second.end(), old_name,
                  new_name);
@@ -244,8 +246,8 @@ void OpDescBind::RenameOutput(const std::string &old_name,
   need_update_ = true;
 }
 
-void OpDescBind::RenameInput(const std::string &old_name,
-                             const std::string &new_name) {
+void OpDesc::RenameInput(const std::string &old_name,
+                         const std::string &new_name) {
   for (auto &input : inputs_) {
     std::replace(input.second.begin(), input.second.end(), old_name, new_name);
   }
@@ -253,8 +255,8 @@ void OpDescBind::RenameInput(const std::string &old_name,
 }
 
 struct SetAttrDescVisitor : public boost::static_visitor<void> {
-  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-  mutable OpDesc::Attr *attr_;
+  explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {}
+  mutable proto::OpDesc::Attr *attr_;
   void operator()(int v) const { attr_->set_i(v); }
   void operator()(float v) const { attr_->set_f(v); }
   void operator()(const std::string &v) const { attr_->set_s(v); }
@@ -272,11 +274,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
   void operator()(const std::vector<bool> &v) const {
     VectorToRepeated(v, attr_->mutable_bools());
   }
-  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(proto::BlockDesc *desc) const {
+    attr_->set_block_idx(desc->idx());
+  }
   void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };
 
-void OpDescBind::Flush() {
+void OpDesc::Flush() {
   if (need_update_) {
     this->desc_.mutable_inputs()->Clear();
     for (auto &ipt : inputs_) {
@@ -297,7 +301,7 @@ void OpDescBind::Flush() {
       auto *attr_desc = desc_.add_attrs();
       attr_desc->set_name(attr.first);
       attr_desc->set_type(
-          static_cast<framework::AttrType>(attr.second.which() - 1));
+          static_cast<proto::AttrType>(attr.second.which() - 1));
       SetAttrDescVisitor visitor(attr_desc);
       boost::apply_visitor(visitor, attr.second);
     }
@@ -328,7 +332,7 @@ static void InitInferShapeFuncs() {
   });
 }
 
-void OpDescBind::CheckAttrs() {
+void OpDesc::CheckAttrs() {
   PADDLE_ENFORCE(!Type().empty(),
                  "CheckAttr() can not be called before type is setted.");
   auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
@@ -340,7 +344,7 @@ void OpDescBind::CheckAttrs() {
   checker->Check(attrs_);
 }
 
-void OpDescBind::InferShape(const BlockDescBind &block) const {
+void OpDesc::InferShape(const BlockDesc &block) const {
   VLOG(3) << "CompileTime infer shape on " << Type();
   InitInferShapeFuncs();
   auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
@@ -363,7 +367,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const {
   infer_shape(&ctx);
 }
 
-void OpDescBind::InferVarType(BlockDescBind *block) const {
+void OpDesc::InferVarType(BlockDesc *block) const {
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
     info.infer_var_type_(*this, block);
@@ -375,14 +379,14 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
     for (auto &out_pair : this->outputs_) {
       for (auto &out_var_name : out_pair.second) {
         block->FindRecursiveOrCreateVar(out_var_name)
-            ->SetType(VarDesc::LOD_TENSOR);
+            ->SetType(proto::VarDesc::LOD_TENSOR);
       }
     }
   }
 }
 
 CompileTimeInferShapeContext::CompileTimeInferShapeContext(
-    const OpDescBind &op, const BlockDescBind &block)
+    const OpDesc &op, const BlockDesc &block)
     : op_(op), block_(block) {}
 
 bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
@@ -484,7 +488,7 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 }
 bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
 
-VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
     const std::string &name) const {
   return block_.FindVarRecursive(name)->GetType();
 }
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index da032319afa775571d3942bf6ae415db7d233735..4cf784a0d0d319d09caa27b4e2b589bd7ac4f324 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -23,19 +23,21 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class BlockDescBind;
-class ProgramDescBind;
+class BlockDesc;
+class ProgramDesc;
 
-class OpDescBind {
+class OpDesc {
  public:
-  OpDescBind() {}
+  OpDesc() {}
 
-  OpDescBind(const std::string &type, const VariableNameMap &inputs,
-             const VariableNameMap &outputs, const AttributeMap &attrs);
+  OpDesc(const std::string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const AttributeMap &attrs);
 
-  OpDescBind(const OpDesc &desc, ProgramDescBind *prog);
+  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
 
-  OpDesc *Proto();
+  void CopyFrom(const OpDesc &op_desc);
+
+  proto::OpDesc *Proto();
 
   std::string Type() const { return desc_.type(); }
 
@@ -59,13 +61,13 @@ class OpDescBind {
     return attrs_.find(name) != attrs_.end();
   }
 
-  AttrType GetAttrType(const std::string &name) const;
+  proto::AttrType GetAttrType(const std::string &name) const;
 
   std::vector<std::string> AttrNames() const;
 
   void SetAttr(const std::string &name, const Attribute &v);
 
-  void SetBlockAttr(const std::string &name, BlockDescBind &block);
+  void SetBlockAttr(const std::string &name, BlockDesc &block);
 
   Attribute GetAttr(const std::string &name) const;
 
@@ -107,9 +109,9 @@ class OpDescBind {
 
   void CheckAttrs();
 
-  void InferShape(const BlockDescBind &block) const;
+  void InferShape(const BlockDesc &block) const;
 
-  void InferVarType(BlockDescBind *block) const;
+  void InferVarType(BlockDesc *block) const;
 
   void MarkAsTarget() { desc_.set_is_target(true); }
 
@@ -126,8 +128,10 @@ class OpDescBind {
     return ret_val;
   }
 
-  OpDesc desc_;
+  proto::OpDesc desc_;
+  // input arg name => output variable names
   VariableNameMap inputs_;
+  // output arg name => output variable names
   VariableNameMap outputs_;
   AttributeMap attrs_;
 
diff --git a/paddle/framework/op_info.cc b/paddle/framework/op_info.cc
index 81ba29797c5f478e5d6a91236f3e8de1e6b43e49..b520108109bb2f72b80f83559fa065a5ca58e9e1 100644
--- a/paddle/framework/op_info.cc
+++ b/paddle/framework/op_info.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_info.h"
 
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index d3b1a3b5fa2cf8f6a9571e92a319f3757666657e..d9b89f9cac9611fcecb18bef87940632df1e2234 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <functional>
@@ -34,7 +34,7 @@ class InferShapeBase {
 struct OpInfo {
   OpCreator creator_;
   GradOpMakerFN grad_op_maker_;
-  OpProto* proto_{nullptr};
+  proto::OpProto* proto_{nullptr};
   OpAttrChecker* checker_{nullptr};
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
@@ -43,7 +43,7 @@ struct OpInfo {
     return proto_ != nullptr && checker_ != nullptr;
   }
 
-  const OpProto& Proto() const {
+  const proto::OpProto& Proto() const {
     PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
     PADDLE_ENFORCE(proto_->IsInitialized(),
                    "Operator Proto must be initialized in op info");
diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..b06002096fb109da806809f7b908d9768cf095ba
--- /dev/null
+++ b/paddle/framework/op_kernel_type.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_layout.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/library_type.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+struct OpKernelType {
+  struct Hash {
+    size_t operator()(const OpKernelType& key) const {
+      int place = key.place_.which() + (1 << LEFT_SHIFT);
+      int data_type =
+          static_cast<int>(key.data_type_) + (1 << (LEFT_SHIFT + 1));
+      int data_layout =
+          static_cast<int>(key.data_layout_) + (1 << (LEFT_SHIFT + 2));
+      int library_type =
+          static_cast<int>(key.library_type_) + (1 << (LEFT_SHIFT + 3));
+      std::hash<int> hasher;
+      return hasher(place + data_type + data_layout + library_type);
+    }
+  };
+
+  // place, data_type, library_type kinds less than 2^8
+  constexpr static int LEFT_SHIFT = 8;
+
+  proto::DataType data_type_;
+  DataLayout data_layout_;
+  platform::Place place_;
+  LibraryType library_type_;
+
+  OpKernelType(proto::DataType data_type, platform::Place place,
+               DataLayout data_layout = DataLayout::kAnyLayout,
+               LibraryType library_type = LibraryType::kPlain)
+      : data_type_(data_type),
+        data_layout_(data_layout),
+        place_(place),
+        library_type_(library_type) {}
+
+  OpKernelType(proto::DataType data_type,
+               const platform::DeviceContext& dev_ctx,
+               DataLayout data_layout = DataLayout::kAnyLayout,
+               LibraryType library_type = LibraryType::kPlain)
+      : data_type_(data_type),
+        data_layout_(data_layout),
+        place_(dev_ctx.GetPlace()),
+        library_type_(library_type) {}
+
+  bool operator==(const OpKernelType& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
+           library_type_ == o.library_type_;
+  }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
+};
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const OpKernelType& kernel_key) {
+  os << "data_type[" << kernel_key.data_type_ << "]:data_layout["
+     << kernel_key.data_layout_ << "]:place[" << kernel_key.place_
+     << "]:library_type[" << kernel_key.library_type_ << "]";
+  return os;
+}
+
+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..649afeee8a846b0579545f2edff77e9dbe3b4dd8
--- /dev/null
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_kernel_type.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+TEST(OpKernelType, ToString) {
+  using OpKernelType = paddle::framework::OpKernelType;
+  using DataType = paddle::framework::proto::DataType;
+  using CPUPlace = paddle::platform::CPUPlace;
+  using DataLayout = paddle::framework::DataLayout;
+  using LibraryType = paddle::framework::LibraryType;
+
+  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
+                              LibraryType::kCUDNN);
+
+  ASSERT_EQ(
+      paddle::framework::KernelTypeToString(op_kernel_type),
+      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
+}
+
+TEST(OpKernelType, Hash) {
+  using OpKernelType = paddle::framework::OpKernelType;
+  using DataType = paddle::framework::proto::DataType;
+  using CPUPlace = paddle::platform::CPUPlace;
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  using DataLayout = paddle::framework::DataLayout;
+  using LibraryType = paddle::framework::LibraryType;
+
+  OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
+                                LibraryType::kCUDNN);
+  OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
+                                LibraryType::kCUDNN);
+
+  OpKernelType::Hash hasher;
+  ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
+}
diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h
index 44e8ab16895cc604f85bb83e240eab55739f8ba0..efd3a5ca535403d8d46a73adc899d914623b53e4 100644
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -22,6 +22,8 @@ namespace framework {
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
  public:
+  using OpProto = proto::OpProto;
+  using OpAttrChecker = framework::OpAttrChecker;
   OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : proto_(proto), op_checker_(op_checker) {}
 
@@ -80,7 +82,7 @@ class OpProtoAndCheckerMaker {
 
 class NOPMaker : public OpProtoAndCheckerMaker {
  public:
-  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {}
 };
 
diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc
index 988a14cf4de8fdf052ca7e8c41bff0c05ba2daaa..f16cb6fa3aa095a6d9737d84c7ce58f385a7072b 100644
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  TestAttrProtoMaker(paddle::framework::OpProto* proto,
+  TestAttrProtoMaker(paddle::framework::proto::OpProto* proto,
                      paddle::framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<float>("scale", "scale of test op");
@@ -27,7 +27,7 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };
 
 TEST(ProtoMaker, DuplicatedAttr) {
-  paddle::framework::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
   auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
   ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
@@ -35,7 +35,7 @@ TEST(ProtoMaker, DuplicatedAttr) {
 
 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  TestInOutProtoMaker(paddle::framework::OpProto* proto,
+  TestInOutProtoMaker(paddle::framework::proto::OpProto* proto,
                       paddle::framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("input", "input of test op");
@@ -44,7 +44,7 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };
 
 TEST(ProtoMaker, DuplicatedInOut) {
-  paddle::framework::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
   auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
   ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
index 8dedd873aad648174b770b84e5232cd17b577e72..dfa151316daeccfe92e26818165a694b78b5df62 100644
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -31,7 +31,8 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
 }
 
 static VariableNameMap ConvertOpDescVarsToVarNameMap(
-    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
+    const google::protobuf::RepeatedPtrField<proto::OpDesc::Var>&
+        op_desc_vars) {
   VariableNameMap ret_val;
   for (auto& var : op_desc_vars) {
     auto& var_names = ret_val[var.parameter()];
@@ -43,9 +44,10 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
   return ret_val;
 }
 
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const proto::OpDesc& op_desc) {
   VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
-             "used in unit tests. Use CreateOp(const OpDescBind& op_desc) "
+             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
              "instead.";
   VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
   VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
@@ -57,7 +59,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
   return CreateOp(op_desc.type(), inputs, outputs, attrs);
 }
 
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
   return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
                   op_desc.GetAttrMap());
 }
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index b29238432b05d81e984e1f4c269a00b01a4229cc..bdaa25918155caca4b64b0ed60aa3f6be03eb12f 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -61,25 +61,14 @@ struct OperatorRegistrar : public Registrar {
 
 class OpRegistry {
  public:
-  template <typename OpType, typename ProtoMakerType, typename GradOpType>
-  static void RegisterOp(const std::string& op_type,
-                         const std::string& grad_op_type) {
-    OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
-    reg.info.grad_op_type_ = grad_op_type;
-    // register gradient op
-    if (!grad_op_type.empty()) {
-      OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
-    }
-  }
-
   static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
                                                 const VariableNameMap& inputs,
                                                 const VariableNameMap& outputs,
                                                 AttributeMap attrs);
 
-  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
+  static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);
 
-  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
 };
 
 template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
@@ -90,30 +79,31 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
   using KERNEL_TYPE =
       typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
 
-  void operator()(const char* op_type) const {
+  void operator()(const char* op_type, const char* library_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
+                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
         func;
-    func(op_type);
+    func(op_type, library_type);
   }
 };
 
 template <typename PlaceType, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
-  void operator()(const char* op_type) const {}
+  void operator()(const char* op_type, const char* library_type) const {}
 };
 
 // User can register many kernel in one place. The data type could be different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
-  explicit OpKernelRegistrar(const char* op_type) {
+  explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
     OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    func(op_type);
+    func(op_type, library_type);
   }
 };
 
@@ -126,6 +116,14 @@ class OpKernelRegistrar : public Registrar {
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+/*
+  The variadic arguments should be class types derived from one of the
+  following classes:
+    OpProtoAndCheckerMaker
+    GradOpDescMakerBase
+    VarTypeInference
+    InferShapeBase
+*/
 #define REGISTER_OPERATOR(op_type, op_class, ...)                      \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
       __reg_op__##op_type,                                             \
@@ -144,20 +142,29 @@ class OpKernelRegistrar : public Registrar {
   }
 
 /**
- * Macro to register Operator.
+ * Macro to register Operator. When the input is duplicable, you should
+ * use REGISTER_OP_EX with deop_empty_grad=false instead.
  */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,       \
-                    grad_op_class)                                         \
-  REGISTER_OPERATOR(grad_op_type, grad_op_class);                          \
-  class _GradOpDescMaker_##grad_op_type##_                                 \
-      : public ::paddle::framework::DefaultGradOpDescMaker<true> {         \
-    using ::paddle::framework::DefaultGradOpDescMaker<                     \
-        true>::DefaultGradOpDescMaker;                                     \
-                                                                           \
-   protected:                                                              \
-    virtual std::string GradOpType() const { return #grad_op_type; }       \
-  };                                                                       \
-  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
+                    grad_op_class)                                   \
+  REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,    \
+                 grad_op_class, true)
+
+// When an argument is duplicable, we need to use this version.
+// Perhaps we can omit DropEmptyIG template parameter and
+// only have one version of REGISTER_OP.
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,       \
+                       grad_op_class, drop_empty_grad)                        \
+  REGISTER_OPERATOR(grad_op_type, grad_op_class);                             \
+  class _GradOpDescMaker_##grad_op_type##_                                    \
+      : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
+    using ::paddle::framework::DefaultGradOpDescMaker<                        \
+        drop_empty_grad>::DefaultGradOpDescMaker;                             \
+                                                                              \
+   protected:                                                                 \
+    virtual std::string GradOpType() const { return #grad_op_type; }          \
+  };                                                                          \
+  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,    \
                     op_maker_class);
 
 #define REGISTER_OP_WITH_KERNEL(op_type, ...)                         \
@@ -175,14 +182,15 @@ class OpKernelRegistrar : public Registrar {
       __reg_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
       "REGISTER_OP_KERNEL must be called in global namespace");           \
   static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
-      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type);      \
+      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type,       \
+                                                          #DEVICE_TYPE);  \
   int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() {                \
     __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch();          \
     return 0;                                                             \
   }
 
 #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
+  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
 
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index b860fe6cac773d1e85adecc43f5dfec42b6c7661..cef530c6e639f6e2188869fa57d114ec6b885aa8 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
 
@@ -8,8 +22,7 @@ namespace framework {
 class CosineOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -28,8 +41,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
 };
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -51,7 +63,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 
 static void BuildVar(const std::string& param_name,
                      std::initializer_list<const char*> arguments,
-                     paddle::framework::OpDesc::Var* var) {
+                     paddle::framework::proto::OpDesc::Var* var) {
   var->set_parameter(param_name);
   for (auto& arg_name : arguments) {
     var->add_arguments(arg_name);
@@ -63,7 +75,7 @@ REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
                              paddle::framework::MyTestOpProtoAndCheckerMaker);
 
 TEST(OpRegistry, CreateOp) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("cos_sim");
   BuildVar("input", {"aa"}, op_desc.add_inputs());
   BuildVar("output", {"bb"}, op_desc.add_outputs());
@@ -71,26 +83,26 @@ TEST(OpRegistry, CreateOp) {
   float scale = 3.3;
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(scale);
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
-  op->Run(scope, dev_ctx);
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
   float scale_get = op->Attr<float>("scale");
   ASSERT_EQ(scale_get, scale);
 }
 
 TEST(OpRegistry, IllegalAttr) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("cos_sim");
   BuildVar("input", {"aa"}, op_desc.add_inputs());
   BuildVar("output", {"bb"}, op_desc.add_outputs());
 
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(-2.0);
 
   bool caught = false;
@@ -108,7 +120,7 @@ TEST(OpRegistry, IllegalAttr) {
 }
 
 TEST(OpRegistry, DefaultValue) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("cos_sim");
   BuildVar("input", {"aa"}, op_desc.add_inputs());
   BuildVar("output", {"bb"}, op_desc.add_outputs());
@@ -117,13 +129,13 @@ TEST(OpRegistry, DefaultValue) {
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
-  op->Run(scope, dev_ctx);
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
   ASSERT_EQ(op->Attr<float>("scale"), 1.0);
 }
 
 TEST(OpRegistry, CustomChecker) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("my_test_op");
   BuildVar("input", {"ii"}, op_desc.add_inputs());
   BuildVar("output", {"oo"}, op_desc.add_outputs());
@@ -145,7 +157,7 @@ TEST(OpRegistry, CustomChecker) {
   // set 'test_attr' set to an illegal value
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("test_attr");
-  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
   attr->set_i(3);
   caught = false;
   try {
@@ -164,12 +176,12 @@ TEST(OpRegistry, CustomChecker) {
   op_desc.mutable_attrs()->Clear();
   attr = op_desc.mutable_attrs()->Add();
   attr->set_name("test_attr");
-  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
   attr->set_i(4);
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
-  op->Run(scope, dev_ctx);
+  op->Run(scope, cpu_place);
   int test_attr = op->Attr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
 }
@@ -184,3 +196,71 @@ TEST(OperatorRegistrar, Test) {
   using namespace paddle::framework;
   OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
 }
+
+namespace paddle {
+namespace framework {
+
+class OpKernelTestMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetActualKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
+  }
+};
+
+template <typename DeviceContext, typename T>
+class OpKernelTest : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {}
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
+                             paddle::framework::OpWithKernelTest,
+                             paddle::framework::OpKernelTestMaker);
+REGISTER_OP_CPU_KERNEL(
+    op_with_kernel,
+    paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OP_CUDA_KERNEL(op_with_kernel,
+                        paddle::framework::OpKernelTest<
+                            paddle::platform::CUDADeviceContext, float>);
+
+TEST(OperatorRegistrar, CPU) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  op->Run(scope, cpu_place);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(OperatorRegistrar, CUDA) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CUDAPlace cuda_place(0);
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  op->Run(scope, cuda_place);
+}
+#endif
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index e83d7547831744333d6a9c36e842d840a2a0dc03..c0be11294c4a6b49ae4bc2f805f76e9f04508349 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+
+#include "paddle/framework/data_transform.h"
+#include "paddle/framework/executor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
 
@@ -240,12 +243,6 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
   return res;
 }
 
-std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
-  os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
-     << "]";
-  return os;
-}
-
 bool OpSupportGPU(const std::string& op_type) {
   auto& all_kernels = OperatorWithKernel::AllOpKernels();
   auto it = all_kernels.find(op_type);
@@ -377,7 +374,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
-  VarDesc::VarType GetVarType(const std::string& name) const override {
+  proto::VarDesc::VarType GetVarType(const std::string& name) const override {
     auto* var = scope_.FindVar(name);
     return ToVarType(var->Type());
   }
@@ -388,11 +385,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
 };
 
 void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
+                             const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
-
-  ExecutionContext ctx(*this, scope, dev_ctx);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);
 
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
@@ -404,20 +401,76 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   // check if op[type] have kernel for kernel_key
   OpKernelMap& kernels = kernels_iter->second;
-  auto kernel_key = GetKernelType(ctx);
-  auto kernel_iter = kernels.find(kernel_key);
+
+  ExecutionContext ctx(*this, scope, *dev_ctx);
+  auto actual_kernel_key = GetActualKernelType(ctx);
+  auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);
+  auto kernel_iter = kernels.find(expected_kernel_key);
 
   if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("The operator %s does not support %s", type_, kernel_key);
+    PADDLE_THROW("The operator %s does not support %s", type_,
+                 expected_kernel_key);
+  }
+
+  if (actual_kernel_key == expected_kernel_key) {
+    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
+                      "Currently, model parallelism is only supported between "
+                      "CPU and other devices. For example, multi-GPU model "
+                      "parallelism will failed.");
+  } else {
+    const DataTransformFn* trans_fun =
+        DataTransformFnMap::Instance().GetNullable(
+            std::make_pair(actual_kernel_key, expected_kernel_key));
+    if (trans_fun) {
+      auto input_vars = this->InputVars();
+      // TODO(qijun) filter the input vars that do not need to be transformed
+
+      // filter vars that has been transformed
+      std::vector<std::string> need_trans;
+      for (auto var_name : input_vars) {
+        auto var_name_trans =
+            var_name + framework::KernelTypeToString(expected_kernel_key);
+        if (!scope.FindVar(var_name_trans)) {
+          const_cast<Scope&>(scope).Var(var_name_trans);
+          need_trans.push_back(var_name);
+        }
+      }
+
+      if (!need_trans.empty()) {
+        // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
+        platform::DeviceContext* trans_dev_ctx = nullptr;
+        std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
+
+        // Wait for transform starting
+        dev_ctx->Wait();
+
+        for (auto var_name : need_trans) {
+          (*trans_fun)(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
+                       scope.FindVar(var_name + framework::KernelTypeToString(
+                                                    expected_kernel_key)));
+        }
+        // Wait for data transform finishing
+        for (auto ctx : trans_dev_ctx_vec) {
+          ctx->Wait();
+        }
+      }
+    }
   }
 
   kernel_iter->second->Compute(ctx);
 }
-OpKernelType OperatorWithKernel::GetKernelType(
+
+OpKernelType OperatorWithKernel::GetActualKernelType(
     const ExecutionContext& ctx) const {
   return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
 }
-DataType OperatorWithKernel::IndicateDataType(
+
+OpKernelType OperatorWithKernel::GetExpectedKernelType(
+    const OpKernelType& actual_kernel_type) const {
+  return actual_kernel_type;
+}
+
+proto::DataType OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
   auto& scope = ctx.scope();
   int data_type = -1;
@@ -443,7 +496,7 @@ DataType OperatorWithKernel::IndicateDataType(
     }
   }
   PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<DataType>(data_type);
+  return static_cast<proto::DataType>(data_type);
 }
 
 }  // namespace framework
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index e60dbfc313f732120f6879fd6fd19ca8abc06813..d0a9b643d565d6651fd7ec0b515f088362852ba3 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -23,15 +23,14 @@ limitations under the License. */
 #include "glog/logging.h"  // For VLOG
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/block_desc.h"
-#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_info.h"
+#include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
 #include "paddle/platform/variant.h"
 #include "paddle/utils/Error.h"
 
@@ -53,6 +52,11 @@ constexpr char kGradVarSuffix[] = "@GRAD";
 /// Variables with this suffix are supposed to be filled up with zeros.
 constexpr char kZeroVarSuffix[] = "@ZERO";
 
+// define some kernel hint
+const std::string kUseCPU = "use_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
 inline std::string GradVarName(const std::string& var_name) {
   return var_name + kGradVarSuffix;
 }
@@ -83,8 +87,10 @@ class OperatorBase {
   virtual std::string DebugString() const;
 
   /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const = 0;
+  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
+
+  // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
+  virtual void Stop() {}
 
   virtual bool IsNetOp() const { return false; }
 
@@ -159,8 +165,7 @@ class OperatorBase {
 class NOP : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
   std::unique_ptr<OperatorBase> Clone() const override {
     return std::unique_ptr<OperatorBase>(new NOP(*this));
   }
@@ -345,33 +350,6 @@ class OpKernel : public OpKernelBase {
   using ELEMENT_TYPE = T;
 };
 
-struct OpKernelType {
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const OpKernelType& key) const {
-      int place = key.place_.which();
-      int data_type = static_cast<int>(key.data_type_);
-      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
-                     (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
-      return hash_(pre_hash);
-    }
-  };
-
-  platform::Place place_;
-  DataType data_type_;
-
-  OpKernelType(DataType data_type, platform::Place place)
-      : place_(place), data_type_(data_type) {}
-
-  OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
-      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
-
-  bool operator==(const OpKernelType& o) const {
-    return platform::places_are_same_class(place_, o.place_) &&
-           data_type_ == o.data_type_;
-  }
-};
-
 class OperatorWithKernel : public OperatorBase {
  public:
   using OpKernelMap =
@@ -382,8 +360,7 @@ class OperatorWithKernel : public OperatorBase {
                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const final;
+  void Run(const Scope& scope, const platform::Place& place) const final;
 
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
   AllOpKernels() {
@@ -404,16 +381,16 @@ class OperatorWithKernel : public OperatorBase {
   }
 
  protected:
-  virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
+  virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const;
+  virtual OpKernelType GetExpectedKernelType(
+      const OpKernelType& actual_kernel_type) const;
 
  private:
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
-  DataType IndicateDataType(const ExecutionContext& ctx) const;
+  proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
 };
 
-std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
-
 extern bool OpSupportGPU(const std::string& op_type);
 
 }  // namespace framework
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index b678178454ff63e4217f0be7a9938a9ba183cda4..4d38a7ada91af834aa1a19b49e36d606ebe786ba 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "paddle/framework/operator.h"
 #include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace framework {
@@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase {
   OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                       const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope, const platform::Place& place) const override {
     ++op_run_num;
     ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
     ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase {
   int x{0};
 };
 
-class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
-                                           OpAttrChecker* op_checker)
+  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
@@ -58,35 +57,36 @@ class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 
 static void BuildVar(const std::string& param_name,
                      std::initializer_list<const char*> arguments,
-                     paddle::framework::OpDesc::Var* var) {
+                     paddle::framework::proto::OpDesc::Var* var) {
   var->set_parameter(param_name);
   for (auto& arg_name : arguments) {
     *var->mutable_arguments()->Add() = arg_name;
   }
 }
 
-REGISTER_OP_WITHOUT_GRADIENT(
-    test_operator, paddle::framework::OpWithoutKernelTest,
-    paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
+REGISTER_OP_WITHOUT_GRADIENT(test_operator,
+                             paddle::framework::OpWithoutKernelTest,
+                             paddle::framework::OpWithoutKernelCheckerMaker);
 
 TEST(OperatorBase, all) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::InitDevices({"CPU"});
+  paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
   BuildVar("output", {"OUT1"}, op_desc.add_outputs());
 
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(3.14);
 
-  paddle::platform::CPUDeviceContext device_context;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   scope.Var("OUT1");
   ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->Run(scope, device_context);
+  op->Run(scope, cpu_place);
   ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
 
@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {}
-  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-    return OpKernelType(DataType::FP32, ctx.GetPlace());
+  OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override {
+    return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
   }
 };
 
@@ -123,7 +123,6 @@ template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel<float> {
  public:
   void Compute(const ExecutionContext& ctx) const {
-    std::cout << "this is cpu kernel" << std::endl;
     std::cout << ctx.op().DebugString() << std::endl;
     cpu_kernel_run_num++;
     ASSERT_EQ(ctx.op().Input("x"), "IN1");
@@ -195,22 +194,23 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
 
 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::InitDevices({"CPU"});
+  paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_kernel");
   BuildVar("x", {"IN1"}, op_desc.add_inputs());
   BuildVar("y", {"OUT1"}, op_desc.add_outputs());
 
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(3.14);
 
-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }
 
@@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
   using namespace paddle::framework;
 
-  OpDesc op_desc;
+  paddle::framework::InitDevices({"CPU"});
+  proto::OpDesc op_desc;
+
   op_desc.set_type("op_multi_inputs_with_kernel");
   BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
   BuildVar("k", {"k0"}, op_desc.add_inputs());
@@ -232,10 +234,10 @@ TEST(OpKernel, multi_inputs) {
 
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(3.14);
 
-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
   scope.Var("x0")->GetMutable<LoDTensor>();
   scope.Var("x1")->GetMutable<LoDTensor>();
@@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) {
   scope.Var("y1")->GetMutable<LoDTensor>();
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
 }
 
 class OperatorClone : public paddle::framework::OperatorBase {
@@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase {
                 const paddle::framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {}
+           const paddle::platform::Place& place) const override {}
 };
 
 TEST(Operator, Clone) {
+  paddle::framework::InitDevices({"CPU"});
   OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                   paddle::framework::VariableNameMap{},
                   paddle::framework::AttributeMap{});
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index 4af8d94563ad0ecf6fcc6fe0575b0f69006a9a2d..b5d9e5e385c1ba57169ef885824fc23b0f130692 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -18,49 +18,49 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
+BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
   auto *b = desc_.add_blocks();
   b->set_parent_idx(parent.ID());
   b->set_idx(desc_.blocks_size() - 1);
-  blocks_.emplace_back(new BlockDescBind(this, b));
+  blocks_.emplace_back(new BlockDesc(this, b));
   return blocks_.back().get();
 }
 
-ProgramDesc *ProgramDescBind::Proto() {
+proto::ProgramDesc *ProgramDesc::Proto() {
   for (auto &block : blocks_) {
     block->Flush();
   }
   return &desc_;
 }
 
-ProgramDescBind::ProgramDescBind() {
+ProgramDesc::ProgramDesc() {
   auto *block = desc_.mutable_blocks()->Add();
   block->set_idx(kRootBlockIndex);
   block->set_parent_idx(kNoneBlockIndex);
-  blocks_.emplace_back(new BlockDescBind(this, block));
+  blocks_.emplace_back(new BlockDesc(this, block));
 }
 
-ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) {
+ProgramDesc::ProgramDesc(const ProgramDesc &o) {
   desc_ = o.desc_;
 
   for (int i = 0; i < desc_.blocks_size(); ++i) {
     auto *block = desc_.mutable_blocks(i);
-    blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this));
+    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
   }
 }
 
-ProgramDescBind::ProgramDescBind(const ProgramDesc &desc) {
+ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
   desc_ = desc;
   for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDescBind(this, &block_desc));
+    blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
 }
 
-ProgramDescBind::ProgramDescBind(const std::string &binary_str) {
+ProgramDesc::ProgramDesc(const std::string &binary_str) {
   PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
                  "Fail to parse program_desc from binary string.");
   for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDescBind(this, &block_desc));
+    blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
 }
 
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index b1cb086de4345902482d8254b8aeec041ecf81bc..15a962bb696d6172acd1a83cf9bb1ffd0846d449 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -23,32 +23,32 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class BlockDescBind;
+class BlockDesc;
 
-class ProgramDescBind {
+class ProgramDesc {
  public:
-  ProgramDescBind();
+  ProgramDesc();
 
-  explicit ProgramDescBind(const ProgramDesc &desc);
+  explicit ProgramDesc(const proto::ProgramDesc &desc);
 
-  ProgramDescBind(const ProgramDescBind &o);
+  ProgramDesc(const ProgramDesc &o);
 
-  explicit ProgramDescBind(const std::string &binary_str);
+  explicit ProgramDesc(const std::string &binary_str);
 
-  BlockDescBind *AppendBlock(const BlockDescBind &parent);
+  BlockDesc *AppendBlock(const BlockDesc &parent);
 
-  BlockDescBind *MutableBlock(size_t idx) { return blocks_[idx].get(); }
+  BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); }
 
-  const BlockDescBind &Block(size_t idx) const { return *blocks_[idx]; }
+  const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; }
 
   size_t Size() const { return blocks_.size(); }
 
-  ProgramDesc *Proto();
+  proto::ProgramDesc *Proto();
 
  private:
-  ProgramDesc desc_;
+  proto::ProgramDesc desc_;
 
-  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
+  std::vector<std::unique_ptr<BlockDesc>> blocks_;
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
index 83e7286e0ec3639fa589b0958922543a3ba16a00..59947c9f2189348226b7ff6c2b9315196bbf55fa 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/program_desc.h"
 #include "gtest/gtest.h"
@@ -19,18 +19,18 @@
 namespace paddle {
 namespace framework {
 TEST(ProgramDesc, copy_ctor) {
-  ProgramDescBind program;
+  ProgramDesc program;
   auto* global_block = program.MutableBlock(0);
   auto* x = global_block->Var("X");
-  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
   x->SetLoDLevel(0);
-  x->SetDataType(FP32);
+  x->SetDataType(proto::FP32);
   x->SetShape({1000, 784});
 
   auto* y = global_block->Var("Y");
-  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
   y->SetLoDLevel(0);
-  y->SetDataType(FP32);
+  y->SetDataType(proto::FP32);
   y->SetShape({784, 100});
 
   auto* op = global_block->AppendOp();
@@ -39,15 +39,15 @@ TEST(ProgramDesc, copy_ctor) {
   op->SetInput("Y", {y->Name()});
 
   auto* out = global_block->Var("Out");
-  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
   op->SetOutput("Y", {out->Name()});
 
-  ProgramDescBind program_copy(program);
+  ProgramDesc program_copy(program);
 
   auto* global_block_copy = program_copy.MutableBlock(0);
   ASSERT_NE(global_block, global_block_copy);
 
-  auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) {
+  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
     ASSERT_TRUE(global_block_copy->HasVar(name));
     auto* copy = global_block_copy->Var(name);
     ASSERT_NE(copy, var_before);
@@ -81,18 +81,18 @@ TEST(ProgramDesc, copy_ctor) {
 }
 
 TEST(ProgramDescBind, serialize_and_deserialize) {
-  ProgramDescBind program_origin;
+  ProgramDesc program_origin;
   auto* global_block = program_origin.MutableBlock(0);
   auto* x = global_block->Var("X");
-  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
   x->SetLoDLevel(0);
-  x->SetDataType(FP32);
+  x->SetDataType(proto::FP32);
   x->SetShape({1000, 784});
 
   auto* y = global_block->Var("Y");
-  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
   y->SetLoDLevel(0);
-  y->SetDataType(FP32);
+  y->SetDataType(proto::FP32);
   y->SetShape({784, 100});
 
   auto* op = global_block->AppendOp();
@@ -101,17 +101,17 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
   op->SetInput("Y", {y->Name()});
 
   auto* out = global_block->Var("Out");
-  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
   op->SetOutput("Y", {out->Name()});
 
   std::string binary_str;
   program_origin.Proto()->SerializeToString(&binary_str);
 
-  ProgramDescBind program_restored(binary_str);
+  ProgramDesc program_restored(binary_str);
   auto* global_block_restored = program_restored.MutableBlock(0);
   ASSERT_NE(global_block, global_block_restored);
 
-  auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) {
+  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
     ASSERT_TRUE(global_block_restored->HasVar(name));
     auto* restored = global_block_restored->Var(name);
     ASSERT_NE(restored, var_before);
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
index da76052eb4d3067214841af72a35cebb26477e7f..25eb813ffb96e9b1e13299421ead9f85c02da59f 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -29,7 +29,7 @@ const std::string kFetchOpType = "fetch";
 const std::string kDropOutOpType = "dropout";
 const std::string kBatchNormOpType = "batch_norm";
 
-bool HasDependentVar(const OpDesc& op_desc,
+bool HasDependentVar(const proto::OpDesc& op_desc,
                      const std::set<std::string>& dependent_vars) {
   for (auto& var : op_desc.outputs()) {
     for (auto& argu : var.arguments()) {
@@ -41,14 +41,15 @@ bool HasDependentVar(const OpDesc& op_desc,
   return false;
 }
 
-bool IsTarget(const OpDesc& op_desc) {
+bool IsTarget(const proto::OpDesc& op_desc) {
   if (op_desc.has_is_target()) {
     return op_desc.is_target();
   }
   return false;
 }
 
-void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id) {
   // TODO(tonyyang-svail):
   //    - will change to use multiple blocks for RNN op and Cond Op
 
@@ -104,12 +105,12 @@ void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
 }
 
 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
-void Prune(const ProgramDesc& input, ProgramDesc* output) {
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
   prune_impl(input, output, 0);
 }
 
-void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
-                             int block_id) {
+void inference_optimize_impl(const proto::ProgramDesc& input,
+                             proto::ProgramDesc* output, int block_id) {
   *output = input;
   auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
   for (auto& op_desc : *op_field) {
@@ -125,7 +126,8 @@ void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
   }
 }
 
-void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) {
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output) {
   inference_optimize_impl(input, output, 0);
 }
 
diff --git a/paddle/framework/prune.h b/paddle/framework/prune.h
index 23db014894348094a98e043aa744c6f0d27b2640..593292523d0c14136791bb804a4721a0740b47ba 100644
--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -20,9 +20,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void Prune(const ProgramDesc& input, ProgramDesc* output);
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output);
 
-void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output);
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc
index f21df37a292fd1e039ee8f8fa26244e26c978cae..d76c5abca94cb87220ce73537a8657c3ec695f4d 100644
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/prune.h"
 
@@ -29,12 +29,12 @@ namespace ops = paddle::operators;
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
+           paddle::framework::BlockDesc *block) {
   // insert output
   for (auto kv : outputs) {
     for (auto v : kv.second) {
       auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
+      var->SetDataType(paddle::framework::proto::DataType::FP32);
     }
   }
 
@@ -51,26 +51,26 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
 }
 
 TEST(Prune, one_operator) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
 
   AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
         block);
 
-  f::ProgramDesc *pdesc = program.Proto();
-  f::ProgramDesc pruned;
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc pruned;
 
-  Prune(*pdesc, &pruned);
+  f::Prune(*pdesc, &pruned);
   PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0);
 
   pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
-  Prune(*pdesc, &pruned);
+  f::Prune(*pdesc, &pruned);
   PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1);
 }
 
 TEST(Prune, forward) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
 
   AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
         block);
@@ -81,19 +81,19 @@ TEST(Prune, forward) {
   AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
         block);
 
-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
 
   for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
-    f::ProgramDesc pruned;
+    f::proto::ProgramDesc pruned;
     pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
-    Prune(*pdesc, &pruned);
+    f::Prune(*pdesc, &pruned);
     PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1);
   }
 }
 
 TEST(Prune, multi_input_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
 
   AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
         block);
@@ -104,17 +104,17 @@ TEST(Prune, multi_input_op) {
   AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
         f::AttributeMap{}, block);
 
-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
   pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);
 
-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
   PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4);
 }
 
 TEST(Prune, multi_output_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
 
   AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
         f::AttributeMap{}, block);
@@ -123,17 +123,17 @@ TEST(Prune, multi_output_op) {
   AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
         block);
 
-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
   pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
 
-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
   PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2);
 }
 
 TEST(Prune, multi_target) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
 
   AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
         f::AttributeMap{}, block);
@@ -142,11 +142,11 @@ TEST(Prune, multi_target) {
   AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
         block);
 
-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
   pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
   pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
 
-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
   PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3);
 }
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 656736e23846c8de50553a608c54a0bdd3272cb1..0c01d605bcd95f5796fba1e5a3351a2640b2898a 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -74,17 +74,9 @@ void Scope::DropKids() {
   kids_.clear();
 }
 
-std::vector<std::string> Scope::GetAllNames(bool recursive) const {
-  std::vector<std::string> known_vars(vars_.size());
-
-  if (recursive) {
-    for (auto& kid : kids_) {
-      auto kid_vars = kid->GetAllNames();
-      for (auto& p : kid_vars) {
-        known_vars.emplace_back(p);
-      }
-    }
-  }
+std::vector<std::string> Scope::LocalVarNames() const {
+  std::vector<std::string> known_vars;
+  known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
     known_vars.emplace_back(p.first);
   }
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 56e815db54b6385c4e4d87f456ed5d59113ca77b..10143326dfa201894c777b3e5e226d5ca5015eda 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -66,7 +66,7 @@ class Scope {
   void DropKids();
 
   // enumerate all the variables current contains.
-  std::vector<std::string> GetAllNames(bool recursive = false) const;
+  std::vector<std::string> LocalVarNames() const;
 
   // Rename variable to a new name
   void Rename(const std::string& origin_name,
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index f738d5ba9ecda57ea25bb5f84057d1d0106eef66..0f5b86061dbdebde08badca7f984f4a2c8d7bc79 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -61,7 +61,7 @@ TEST(Scope, GetAllNames) {
   Variable* v = s.Var("a");
   EXPECT_EQ(&s, s.FindScope(v));
 
-  std::vector<std::string> ans = s.GetAllNames();
+  std::vector<std::string> ans = s.LocalVarNames();
   std::string str;
   for (auto& var : ans) {
     str += var;
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index 7dac1cfd5ee0c320c67bc0b2448417d258d6862b..e53cc0cdabc623ae358f1a3e21823a2f38ec3c62 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/shape_inference.h"
 #include "grad_op_desc_maker.h"
 #include "paddle/framework/operator.h"
@@ -57,17 +57,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
     SetDim(names[i], dims[i]);
   }
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
     const std::string &name) const {
   return GetVarTypes(Inputs(name));
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
     const std::string &name) const {
   return GetVarTypes(Outputs(name));
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
     const std::vector<std::string> &names) const {
-  std::vector<VarDesc::VarType> retv;
+  std::vector<proto::VarDesc::VarType> retv;
   retv.resize(names.size());
   std::transform(names.begin(), names.end(), retv.begin(),
                  std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index 46f2ea84b4b64292cc9026ef9864621efba79c7a..f93319d8f2fd4c5d388bd57fd595a6a5edd51775 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -27,8 +27,9 @@ class InferShapeContext {
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
 
-  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
-  std::vector<VarDesc::VarType> GetOutputsVarType(
+  std::vector<proto::VarDesc::VarType> GetInputsVarType(
+      const std::string &name) const;
+  std::vector<proto::VarDesc::VarType> GetOutputsVarType(
       const std::string &name) const;
 
   virtual bool HasInputs(const std::string &name) const = 0;
@@ -65,10 +66,10 @@ class InferShapeContext {
   std::vector<framework::DDim> GetDims(
       const std::vector<std::string> &names) const;
 
-  std::vector<VarDesc::VarType> GetVarTypes(
+  std::vector<proto::VarDesc::VarType> GetVarTypes(
       const std::vector<std::string> &names) const;
 
-  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
+  virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
index ea7b2a1f7b17d9abc2c2e14de5ecd1cf4a1a5027..f922e606249849e621e679f71d6dbe0f007c8464 100644
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/tensor.h"
 
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 6a0c5133c9a6bb326ca51755242e75b6eb9e5474..341a6949beeb2dfa64b23d2079bd8f48750a94f8 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -20,12 +20,12 @@ limitations under the License. */
 #include <typeindex>
 #include <vector>
 
+#include "paddle/framework/data_layout.h"
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
-#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 
@@ -115,6 +115,10 @@ class Tensor {
 
   inline void check_memory_size() const;
 
+  inline DataLayout layout() const { return layout_; }
+
+  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+
  private:
   friend class LoDTensor;
 
@@ -173,6 +177,19 @@ class Tensor {
 
   DDim dims_;
 
+  /**
+   * @brief the layout of memory block, default is NHWC.
+   *
+   * @note the memory allocation order, describe how weight/data is stored
+   *       For example, in 4-D Tensor(rank=4), there are three commonly
+   *       used layout. They are
+   *            NCHW, NHWC, CHWN.
+   *       N,C,H,W for respectively the batch size, the number of
+   *       feature maps, the height.
+   */
+
+  DataLayout layout_ = DataLayout::kNHWC;
+
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
    *
diff --git a/paddle/framework/tensor.md b/paddle/framework/tensor.md
index 7a80816d8e4ffa3a9462f3d9b87eff0f048466aa..0a27ac9bb6b03649d42e12100fda9e80a56e7f56 100644
--- a/paddle/framework/tensor.md
+++ b/paddle/framework/tensor.md
@@ -71,7 +71,7 @@ private:
 ```
 
 ```c++
-typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<CUDAPlace, CpuPlace> Place;
 typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
                        Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
 typedef boost::variant<
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index aba1f9f09329f890ef190f8820b958c56f017e89..6c6f298edc187a87677089e54c4c9046821282df 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
           boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
     }
 #else
-      holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
-          boost::get<platform::GPUPlace>(place), size, type));
+      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+          boost::get<platform::CUDAPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
@@ -165,6 +165,7 @@ inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
     size_t base = numel() / dims_[0];
     Tensor dst;
     dst.holder_ = holder_;
+    dst.set_layout(layout_);
     DDim dst_dims = dims_;
     dst_dims[0] = end_idx - begin_idx;
     dst.Resize(dst_dims);
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index ceca64365a1a628642eb374a3e3bbdff490c955a..ca76a9fcb9079bab22f7b192c45903852c91797f 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -80,20 +80,20 @@ TEST(Tensor, MutableData) {
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
     EXPECT_NE(p1, nullptr);
     // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
     // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
     EXPECT_EQ(p1, p2);
     // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
     EXPECT_EQ(p1, p2);
   }
 #endif
@@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) {
   {
     Tensor src_tensor;
     Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
     dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
@@ -166,7 +166,7 @@ TEST(Tensor, Slice) {
 #ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+    src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
     Tensor slice_tensor = src_tensor.Slice(2, 6);
     DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 2);
@@ -176,11 +176,11 @@ TEST(Tensor, Slice) {
     uintptr_t src_data_address =
         reinterpret_cast<uintptr_t>(src_tensor.data<double>());
     uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+        src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
     uintptr_t slice_data_address =
         reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
     uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+        slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
     EXPECT_EQ(src_data_address, src_mutable_data_address);
     EXPECT_EQ(slice_data_address, slice_mutable_data_address);
     EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
@@ -200,3 +200,12 @@ TEST(Tensor, ReshapeToMatrix) {
   ASSERT_EQ(res.dims()[0], 2 * 3);
   ASSERT_EQ(res.dims()[1], 4 * 9);
 }
+
+TEST(Tensor, Layout) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  ASSERT_EQ(src.layout(), DataLayout::kNHWC);
+  src.set_layout(DataLayout::kAnyLayout);
+  ASSERT_EQ(src.layout(), DataLayout::kAnyLayout);
+}
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index 4e34b90d57eed8fea84b83045df61a98483c8849..ea4e4f22ea82ccc9f8b683d2fd773a7bc37f78a3 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/tensor.h"
@@ -33,6 +33,7 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
   src.check_memory_size();
 
   dst->Resize(src.dims());
+  dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
 
@@ -47,11 +48,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     memory::Copy(
         dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -59,21 +60,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
     memory::Copy(
         dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     memory::Copy(
         dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -82,6 +83,29 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
 #endif
 }
 
+/**
+ * @brief CopyFrom support CPU <-> CPU
+ */
+inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                     Tensor* dst) {
+  src.check_memory_size();
+  dst->Resize(src.dims());
+  dst->set_layout(src.layout());
+
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  auto size = src.numel() * SizeOfType(src.type());
+
+  PADDLE_ENFORCE(platform::is_cpu_place(src_place) &&
+                 platform::is_cpu_place(dst_place));
+
+  memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+               boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+}
+
 /**
  * @brief   Copy the content of an external vector to a tensor.
  *
@@ -108,13 +132,28 @@ inline void CopyFromVector(const std::vector<T>& src,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
-        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
         size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
 #endif
 }
 
+/**
+ * @brief CopyFromVector CPU vector -> CPU Tensor
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
+  platform::CPUPlace dst_place = platform::CPUPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+}
+
 /**
  * @brief   Copy the content of a tensor to a vector
  *
@@ -141,12 +180,30 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
-        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
         src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
 #endif
 }
 
+/**
+ * @brief CopyToVector CPUTensor <-> CPU Vector
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
+
+  memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
+               src_ptr, size);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
index 03a70de182d0eb499a81413d38229c81c4378b91..f388c19f28ed28335818733f946d8eaf18464627 100644
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
@@ -17,6 +17,7 @@
 
 namespace paddle {
 namespace framework {
+
 TEST(CopyFrom, Tensor) {
   Tensor src_tensor;
   Tensor dst_tensor;
@@ -27,9 +28,10 @@ TEST(CopyFrom, Tensor) {
 
   int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
   memcpy(src_ptr, arr, 9 * sizeof(int));
+  src_tensor.set_layout(DataLayout::kAnyLayout);
 
   auto cpu_place = new platform::CPUPlace();
-  CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+  CopyFrom(src_tensor, *cpu_place, &dst_tensor);
 
   const int* dst_ptr = dst_tensor.data<int>();
   ASSERT_NE(src_ptr, dst_ptr);
@@ -37,14 +39,18 @@ TEST(CopyFrom, Tensor) {
     EXPECT_EQ(src_ptr[i], dst_ptr[i]);
   }
 
+  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
+
   Tensor slice_tensor = src_tensor.Slice(1, 2);
-  CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+  CopyFrom(slice_tensor, *cpu_place, &dst_tensor);
   const int* slice_ptr = slice_tensor.data<int>();
   dst_ptr = dst_tensor.data<int>();
   ASSERT_NE(dst_ptr, slice_ptr);
   for (size_t i = 0; i < 3; ++i) {
     EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
   }
+  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
+
 #ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
@@ -58,7 +64,7 @@ TEST(CopyFrom, Tensor) {
     memcpy(src_ptr, arr, 9 * sizeof(int));
 
     // CPU Tensor to GPU Tensor
-    auto gpu_place = new platform::GPUPlace(0);
+    auto gpu_place = new platform::CUDAPlace(0);
     platform::CUDADeviceContext gpu_ctx(*gpu_place);
     CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
 
@@ -90,6 +96,8 @@ TEST(CopyFrom, Tensor) {
     for (size_t i = 0; i < 3; ++i) {
       EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
     }
+
+    EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
   }
 #endif
 }
@@ -104,8 +112,7 @@ TEST(CopyFromVector, Tensor) {
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    CopyFromVector<int>(src_vec, &cpu_tensor);
 
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
@@ -117,7 +124,7 @@ TEST(CopyFromVector, Tensor) {
 
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
     cpu_tensor.Resize(make_ddim({2, 2}));
-    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    CopyFromVector<int>(src_vec, &cpu_tensor);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
     ASSERT_NE(src_ptr, cpu_ptr);
@@ -143,7 +150,7 @@ TEST(CopyFromVector, Tensor) {
 
     // Copy to GPUTensor
     gpu_tensor.Resize(make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::GPUPlace();
+    auto gpu_place = new paddle::platform::CUDAPlace();
     CUDADeviceContext gpu_ctx(*gpu_place);
     CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
     // Copy from GPU to CPU tensor for comparison
@@ -198,9 +205,8 @@ TEST(CopyToVector, Tensor) {
     }
 
     CPUPlace place;
-    CPUDeviceContext cpu_ctx(place);
     std::vector<int> dst;
-    CopyToVector<int>(src, cpu_ctx, &dst);
+    CopyToVector<int>(src, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_ptr[i], dst[i]);
@@ -210,7 +216,7 @@ TEST(CopyToVector, Tensor) {
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     Tensor gpu_tensor;
-    GPUPlace place;
+    CUDAPlace place;
     CUDADeviceContext gpu_ctx(place);
     CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
 
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..109a7e7dc440d91e8223f2c0924f489f54a06f64
--- /dev/null
+++ b/paddle/framework/threadpool.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/threadpool.h"
+
+namespace paddle {
+namespace framework {
+
+std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
+std::once_flag ThreadPool::init_flag;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f6b2d458f7ee764c22d203f285b78023b6012f3
--- /dev/null
+++ b/paddle/framework/threadpool.h
@@ -0,0 +1,156 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+typedef std::function<void()> Task;
+
+class ThreadPool {
+ public:
+  /**
+   * @brief   Get a instance of threadpool, the thread number will
+   *          be specified as the number of hardware thread contexts
+   */
+  static ThreadPool* GetInstance() {
+    std::call_once(init_flag, &ThreadPool::Init);
+    return threadpool.get();
+  }
+
+  ~ThreadPool() {
+    {
+      // notify all threads to stop running
+      running_ = false;
+      scheduled_.notify_all();
+    }
+
+    for (auto& t : threads_) {
+      t->join();
+      t.reset(nullptr);
+    }
+  }
+
+  int GetNumThreads() const { return num_threads_; }
+
+  int GetAvailable() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return available_;
+  }
+
+  /**
+   * @brief   Push a function to the queue, and will be scheduled and
+   *          executed if a thread is available.
+   * @param[in] Task  will be pushed to the task queue.
+   */
+  void Run(const Task& fn) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    tasks_.push(fn);
+    lock.unlock();
+    scheduled_.notify_one();
+  }
+
+  /**
+   * @brief   Wait until all the tasks are completed.
+   */
+  void Wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    completed_.wait(lock, [=] { return Done() == true; });
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(ThreadPool);
+
+  explicit ThreadPool(int num_threads)
+      : num_threads_(num_threads), available_(num_threads), running_(true) {
+    threads_.resize(num_threads);
+    for (auto& thread : threads_) {
+      // TODO(Yancey1989): binding the thread on the specify CPU number
+      thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+    }
+  }
+
+  /**
+   * @brief   If the task queue is empty and avaialbe
+   *          is equal to the number of threads, means that
+   *          all tasks are completed.
+   *
+   *          Note: this function is not thread-safe.
+   *
+   * @return true if all tasks are completed.
+   */
+  bool Done() { return tasks_.empty() && available_ == num_threads_; }
+
+  void TaskLoop() {
+    while (running_) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+      if (!running_) {
+        break;
+      }
+      // pop a task from the task queue
+      auto task = tasks_.front();
+      tasks_.pop();
+
+      --available_;
+      lock.unlock();
+
+      // run the task
+      task();
+
+      {
+        std::unique_lock<std::mutex> lock(mutex_);
+        ++available_;
+        if (Done()) {
+          completed_.notify_all();
+        }
+      }
+    }
+  }
+
+  static void Init() {
+    if (threadpool.get() == nullptr) {
+      // TODO(Yancey1989): specify the max threads number
+      int num_threads = std::thread::hardware_concurrency();
+      PADDLE_ENFORCE_GT(num_threads, 0);
+      threadpool.reset(new ThreadPool(num_threads));
+    }
+  }
+
+ private:
+  static std::unique_ptr<ThreadPool> threadpool;
+  static std::once_flag init_flag;
+
+  int num_threads_;
+  int available_;
+  bool running_;
+  std::queue<Task> tasks_;
+  std::vector<std::unique_ptr<std::thread>> threads_;
+  std::mutex mutex_;
+  std::condition_variable scheduled_;
+  std::condition_variable completed_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..012d92a5edc415f0bb2f8a0ea38ffeb9549d54fa
--- /dev/null
+++ b/paddle/framework/threadpool_test.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <atomic>
+
+#include "threadpool.h"
+
+namespace framework = paddle::framework;
+
+void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+  for (int i = 0; i < cnt; ++i) {
+    pool->Run([&sum]() { sum.fetch_add(1); });
+  }
+}
+
+TEST(ThreadPool, ConcurrentInit) {
+  framework::ThreadPool* pool;
+  int concurrent_cnt = 50;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < concurrent_cnt; ++i) {
+    std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
+    threads.push_back(std::move(t));
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST(ThreadPool, ConcurrentStart) {
+  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
+  std::atomic<int> sum(0);
+  std::vector<std::thread> threads;
+  int concurrent_cnt = 50;
+  // sum = (n * (n + 1)) / 2
+  for (int i = 1; i <= concurrent_cnt; ++i) {
+    std::thread t(do_sum, pool, std::ref(sum), i);
+    threads.push_back(std::move(t));
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  pool->Wait();
+  EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2);
+}
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
index baeb98c9bd49ec65da5931bcbe33ab788f86f3e8..d834d343759fa279a1444c6337956ffce1b9061a 100644
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <functional>
@@ -25,11 +25,9 @@
 namespace paddle {
 namespace framework {
 class OperatorBase;
-class OpDescBind;
-class BlockDescBind;
-class BlockDesc;
+class OpDesc;
 class InferShapeContext;
-class BlockDescBind;
+class BlockDesc;
 
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 
@@ -37,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
     boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                    std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDescBind*>;
+                   std::vector<bool>, BlockDesc*>;
 
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
@@ -45,13 +43,13 @@ using OpCreator = std::function<OperatorBase*(
     const std::string& /*type*/, const VariableNameMap& /*inputs*/,
     const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
-using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDescBind>>(
-    const OpDescBind&, const std::unordered_set<std::string>& /*no_grad_set*/,
+using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
+    const OpDesc&, const std::unordered_set<std::string>& /*no_grad_set*/,
     std::unordered_map<std::string, std::string>* /*grad_to_var*/,
-    const std::vector<BlockDescBind*>& grad_block)>;
+    const std::vector<BlockDesc*>& grad_block)>;
 
-using InferVarTypeFN = std::function<void(const OpDescBind& /*op_desc*/,
-                                          BlockDescBind* /*block*/)>;
+using InferVarTypeFN =
+    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
 
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 0babec29f6f4412ed29deeafe24470e86b30a636..7d002b9ea0b597730685ee03b021c4982f787f49 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -18,30 +18,32 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
+proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); }
 
-void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
+void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); }
 
-void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
+void VarDesc::SetShape(const std::vector<int64_t> &dims) {
   VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }
 
-void VarDescBind::SetDataType(DataType data_type) {
+void VarDesc::SetDataType(proto::DataType data_type) {
   mutable_tensor_desc()->set_data_type(data_type);
 }
 
-std::vector<int64_t> VarDescBind::Shape() const {
+std::vector<int64_t> VarDesc::Shape() const {
   return RepeatedToVector(tensor_desc().dims());
 }
 
-DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
+proto::DataType VarDesc::GetDataType() const {
+  return tensor_desc().data_type();
+}
 
-void VarDescBind::SetLoDLevel(int32_t lod_level) {
+void VarDesc::SetLoDLevel(int32_t lod_level) {
   switch (desc_.type()) {
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
       desc_.mutable_lod_tensor()->set_lod_level(lod_level);
       break;
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
       desc_.mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
@@ -50,11 +52,11 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
   }
 }
 
-int32_t VarDescBind::GetLodLevel() const {
+int32_t VarDesc::GetLodLevel() const {
   switch (desc_.type()) {
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
       return desc_.lod_tensor().lod_level();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().lod_level();
     default:
       PADDLE_THROW("Tensor type=%d does not support LoDLevel",
@@ -62,29 +64,29 @@ int32_t VarDescBind::GetLodLevel() const {
   }
 }
 
-const TensorDesc &VarDescBind::tensor_desc() const {
+const proto::TensorDesc &VarDesc::tensor_desc() const {
   PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
   switch (desc_.type()) {
-    case VarDesc::SELECTED_ROWS:
+    case proto::VarDesc::SELECTED_ROWS:
       return desc_.selected_rows();
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
       return desc_.lod_tensor().tensor();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
   }
 }
 
-TensorDesc *VarDescBind::mutable_tensor_desc() {
+proto::TensorDesc *VarDesc::mutable_tensor_desc() {
   PADDLE_ENFORCE(desc_.has_type(),
                  "invoke MutableTensorDesc must after set type");
   switch (desc_.type()) {
-    case VarDesc::SELECTED_ROWS:
+    case proto::VarDesc::SELECTED_ROWS:
       return desc_.mutable_selected_rows();
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
       return desc_.mutable_lod_tensor()->mutable_tensor();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 5cf4608944c5011d798fbde060002a57be8f6102..4fd2abe7fb215c3ac454de3e30754685111eb570 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -53,44 +53,44 @@ inline void VectorToRepeated(const std::vector<bool> &vec,
   }
 }
 
-class VarDescBind {
+class VarDesc {
  public:
-  explicit VarDescBind(const std::string &name) {
+  explicit VarDesc(const std::string &name) {
     desc_.set_name(name);
-    desc_.set_type(VarDesc::LOD_TENSOR);
+    desc_.set_type(proto::VarDesc::LOD_TENSOR);
   }
 
-  explicit VarDescBind(const VarDesc &desc) : desc_(desc) {}
+  explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
 
-  VarDesc *Proto() { return &desc_; }
+  proto::VarDesc *Proto() { return &desc_; }
 
   std::string Name() const { return desc_.name(); }
 
   void SetShape(const std::vector<int64_t> &dims);
 
-  void SetDataType(DataType data_type);
+  void SetDataType(proto::DataType data_type);
 
   std::vector<int64_t> Shape() const;
 
-  DataType GetDataType() const;
+  proto::DataType GetDataType() const;
 
   void SetLoDLevel(int32_t lod_level);
 
   int32_t GetLodLevel() const;
 
-  VarDesc::VarType GetType() const;
+  proto::VarDesc::VarType GetType() const;
 
-  void SetType(VarDesc::VarType type);
+  void SetType(proto::VarDesc::VarType type);
 
   bool Persistable() const { return desc_.persistable(); }
 
   void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
 
  private:
-  const TensorDesc &tensor_desc() const;
-  TensorDesc *mutable_tensor_desc();
+  const proto::TensorDesc &tensor_desc() const;
+  proto::TensorDesc *mutable_tensor_desc();
 
-  VarDesc desc_;
+  proto::VarDesc desc_;
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index 0f19870bec3e69d07278507cc556a86bbd25d12d..0e6ea8dc69fe9b7fdaa1163b8d63295624cd3abc 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/framework.pb.h"
@@ -20,15 +20,15 @@
 
 namespace paddle {
 namespace framework {
-inline VarDesc::VarType ToVarType(std::type_index type) {
+inline proto::VarDesc::VarType ToVarType(std::type_index type) {
   if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-    return VarDesc_VarType_LOD_TENSOR;
+    return proto::VarDesc_VarType_LOD_TENSOR;
   } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
-    return VarDesc_VarType_LOD_RANK_TABLE;
+    return proto::VarDesc_VarType_LOD_RANK_TABLE;
   } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
-    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+    return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
   } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-    return VarDesc_VarType_SELECTED_ROWS;
+    return proto::VarDesc_VarType_SELECTED_ROWS;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
@@ -37,16 +37,16 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
 template <typename Visitor>
 inline void VisitVarType(const Variable& var, Visitor visitor) {
   switch (ToVarType(var.Type())) {
-    case VarDesc_VarType_LOD_TENSOR:
+    case proto::VarDesc_VarType_LOD_TENSOR:
       visitor(var.Get<framework::LoDTensor>());
       return;
-    case VarDesc_VarType_LOD_RANK_TABLE:
+    case proto::VarDesc_VarType_LOD_RANK_TABLE:
       visitor(var.Get<LoDRankTable>());
       return;
-    case VarDesc_VarType_LOD_TENSOR_ARRAY:
+    case proto::VarDesc_VarType_LOD_TENSOR_ARRAY:
       visitor(var.Get<LoDTensorArray>());
       return;
-    case VarDesc_VarType_SELECTED_ROWS:
+    case proto::VarDesc_VarType_SELECTED_ROWS:
       visitor(var.Get<SelectedRows>());
       return;
     default:
diff --git a/paddle/framework/var_type_inference.h b/paddle/framework/var_type_inference.h
index 32abbeb33479444c5e7a9889f4211f59af07f98f..6c11f2fee7f554fb008f559bb33aeafea5c5a556 100644
--- a/paddle/framework/var_type_inference.h
+++ b/paddle/framework/var_type_inference.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/type_defs.h"
@@ -21,8 +21,7 @@ namespace framework {
 class VarTypeInference {
  public:
   virtual ~VarTypeInference() {}
-  virtual void operator()(const OpDescBind& op_desc,
-                          BlockDescBind* block) const = 0;
+  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/framework/var_type_inference_test.cc
index 9035e63fa48ffdf7c72061b0a4248538d7a357e4..fa6018b1c583abaccb0259b82c9bb61c0fc10820 100644
--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/var_type_inference.h"
 #include "gtest/gtest.h"
@@ -33,17 +33,16 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 
 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDescBind &op_desc,
-                  BlockDescBind *block) const override {
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
     auto &inputs = op_desc.Input("X");
-    auto default_var_type = VarDesc::SELECTED_ROWS;
+    auto default_var_type = proto::VarDesc::SELECTED_ROWS;
 
     bool any_input_is_lod_tensor = std::any_of(
         inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == VarDesc::LOD_TENSOR;
+          return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
-      default_var_type = VarDesc::LOD_TENSOR;
+      default_var_type = proto::VarDesc::LOD_TENSOR;
     }
 
     auto out_var_name = op_desc.Output("Out").front();
@@ -62,43 +61,43 @@ namespace paddle {
 namespace framework {
 
 TEST(InferVarType, sum_op) {
-  ProgramDescBind prog;
+  ProgramDesc prog;
   auto *op = prog.MutableBlock(0)->AppendOp();
   op->SetType("sum");
   op->SetInput("X", {"test_a", "test_b", "test_c"});
   op->SetOutput("Out", {"test_out"});
 
-  prog.MutableBlock(0)->Var("test_a")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS);
   prog.MutableBlock(0)->Var("test_out");
 
   op->InferVarType(prog.MutableBlock(0));
 
-  ASSERT_EQ(VarDesc::SELECTED_ROWS,
+  ASSERT_EQ(proto::VarDesc::SELECTED_ROWS,
             prog.MutableBlock(0)->Var("test_out")->GetType());
 
-  prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR);
   op->InferVarType(prog.MutableBlock(0));
-  ASSERT_EQ(VarDesc::LOD_TENSOR,
+  ASSERT_EQ(proto::VarDesc::LOD_TENSOR,
             prog.MutableBlock(0)->Var("test_out")->GetType());
 }
 
 TEST(InferVarType, sum_op_without_infer_var_type) {
-  ProgramDescBind prog;
+  ProgramDesc prog;
   auto *op = prog.MutableBlock(0)->AppendOp();
   op->SetType("sum_without_infer_var_type");
   op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
   op->SetOutput("Out", {"test2_out"});
 
-  prog.MutableBlock(0)->Var("test2_a")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_b")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS);
   prog.MutableBlock(0)->Var("test2_out");
 
   op->InferVarType(prog.MutableBlock(0));
 
-  ASSERT_EQ(VarDesc_VarType_LOD_TENSOR,
+  ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR,
             prog.MutableBlock(0)->Var("test2_out")->GetType());
 }
 
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 8d34eee886a6202691e5dec2ab62e7c5b0ac7fb1..de7b70e271b38ebe3a4c38704d0cced47d010788 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,6 +126,11 @@ public:
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
+#ifdef PADDLE_MOBILE_INFERENCE
+    if (Device == DEVICE_TYPE_CPU) {
+      memory_.reset();
+    }
+#endif
   }
 };
 
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
index 741984bb68d3881f6ac26eaca7790190ed6e572a..ac217f1363dbd0360645bbe07cd71a17cc931a79 100644
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
   }
 
   /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1UL);
+  CHECK_EQ(config_.inputs_size(), 1);
   const NormConfig& conf = config_.inputs(0).norm_conf();
   localSize_ = conf.size();
   alpha_ = conf.scale();
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
index 2c8256b91c97b513ce7237b8174c522430094926..7d7c30b4d89e2dd137e7fc7de3159c07bbab9fb4 100644
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -84,12 +84,15 @@ void ROIPoolLayer::forward(PassType passType) {
   size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
 
   real* outputData = outputValue->getData();
-  Matrix::resizeOrCreate(maxIdxs_,
-                         numROIs,
-                         channels_ * pooledHeight_ * pooledWidth_,
-                         false,
-                         false);
-  real* argmaxData = maxIdxs_->getData();
+  real* argmaxData = nullptr;
+  if (passType != PASS_TEST) {
+    Matrix::resizeOrCreate(maxIdxs_,
+                           numROIs,
+                           channels_ * pooledHeight_ * pooledWidth_,
+                           false,
+                           false);
+    argmaxData = maxIdxs_->getData();
+  }
 
   for (size_t n = 0; n < numROIs; ++n) {
     // the first five elememts of each RoI should be:
@@ -128,14 +131,18 @@ void ROIPoolLayer::forward(PassType passType) {
           bool isEmpty = (hend <= hstart) || (wend <= wstart);
           size_t poolIndex = ph * pooledWidth_ + pw;
           outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
-          argmaxData[poolIndex] = -1;
+          if (argmaxData) {
+            argmaxData[poolIndex] = -1;
+          }
 
           for (size_t h = hstart; h < hend; ++h) {
             for (size_t w = wstart; w < wend; ++w) {
               size_t index = h * width_ + w;
               if (batchData[index] > outputData[poolIndex]) {
                 outputData[poolIndex] = batchData[index];
-                argmaxData[poolIndex] = index;
+                if (argmaxData) {
+                  argmaxData[poolIndex] = index;
+                }
               }
             }
           }
@@ -143,7 +150,9 @@ void ROIPoolLayer::forward(PassType passType) {
       }
       batchData += channelOffset;
       outputData += poolChannelOffset;
-      argmaxData += poolChannelOffset;
+      if (argmaxData) {
+        argmaxData += poolChannelOffset;
+      }
     }
     bottomROIs += roiOffset;
   }
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index 5fa7b6f4881b9582b540a5b1bfe849220cc2a4ea..6b769378d24838364701d0f128a7308c6195cc41 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
     hl_sequence2batch_copy(
         batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
   } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
+    if (seq2batch) {
+#ifdef PADDLE_USE_MKLML
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
+#pragma omp parallel for collapse(2)
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
+                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
+                                            : blockMemSize);
+        }
+      }
+#else
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(batch.rowBuf(i),
                sequence.rowBuf(idxData[i]),
                seqWidth * sizeof(real));
-      } else {
+      }
+#endif
+    } else {
+#ifdef PADDLE_USE_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(sequence.rowBuf(idxData[i]),
                batch.rowBuf(i),
                seqWidth * sizeof(real));
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
index 76ad3a01239e409caeefc36a3d562ed5e388dc92..efebbce50405018c6b7ce2049f8d55c33680469f 100644
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -79,7 +79,7 @@ public:
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit float16(const half& h) {
 #if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(&h)->x;
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
 #else
     x = h.x;
 #endif  // CUDA_VERSION >= 9000
@@ -145,7 +145,7 @@ public:
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(&rhs)->x;
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
 #else
     x = rhs.x;
 #endif
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index 6cb003c50bc7d142d65b0591e7e5235431d2ea42..7cf61d089b39041b7a15184e0ea9211d14a66f5e 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
 To allocate 4KB memory on the 3rd GPU:
 
 ```cpp
-p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
 ```
 
 To free memory and check the so-far used amount of memory on a place:
 
 ```cpp
-auto pl = platform::GPUPlace(0);
+auto pl = platform::CUDAPlace(0);
 p = memory::Alloc(pl, 4*1024);
 cout << memory::Used(pl);
 memory::Free(pl, p);
@@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place);
 }  // namespace memory
 ```
 
-These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
 
 ```cpp
 template<>
@@ -49,7 +49,7 @@ and
 
 ```cpp
 template<>
-void Alloc<GPUPlace>(GPUPlace p, size_t size) {
+void Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
   return GetGPUBuddyAllocator(p.id)->Alloc(size);
 }
 ```
@@ -122,7 +122,7 @@ There are two implementations of `Context`:
 
 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
 
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
 
 ### Majel
 
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 64ee53803891f192302bb915027f0499dfa36411..2bc2c06a15702b29c8bdf755978aebe0e6219b4a 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 9c41378483993101a098fc4ad1068c1ef908e566..4e0135dd655d04b7f99722a2159795738c1b29c7 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
index fc40993208323f1f5d18103165c8835b5f829613..f50eceba096477d7b2f50f7c406770c8e9595332 100644
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/memory/detail/memory_block.h"
 #include "paddle/memory/detail/meta_cache.h"
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
index a5168b519f3a3747f34ef2ea7b87d72dce70064d..a4ca51b31b0df269b4b48f824bfd2b74f2a059fb 100644
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc
index 7e2f92b00ca5d787c1114176c5dc3304ca3ebe26..2bacca75108f9f80e7aa291fcb4fd66112201394 100644
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/memory/detail/meta_cache.h"
 #include "glog/logging.h"
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h
index cf5815644284c23a1d2abc904f8c5053ce107a72..db8ffd49ae30cf72ca691894af2df08a7106d02f 100644
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc
index 70c5c1f439e84ec33cf0507beae33f9cdfa51727..dc57d4d2376ab1cfe6fb49c92af4591b3972a53a 100644
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/memory/detail/meta_data.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/memory/detail/meta_data.h"
 
diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h
index 628cf1f2e347e288d1bf34c14c7b2f13a28d3662..6b83c42eb851f0487bed6d625d848cf90db00929 100644
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/memory/detail/meta_data.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 6a815a1b57db1d833781ca224f34e4559af9b9a5..509250debc2b2fd2e87078ab5f233ae2db6fd898 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
+#include <algorithm>   // for std::max
 
 #include "gflags/gflags.h"
 
@@ -28,7 +29,7 @@ limitations under the License. */
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
 DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
-
+DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -77,45 +78,20 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0.  We just make sure it does.
   if (size <= 0) return nullptr;
-
-  size_t available = 0;
-  size_t capacity = 0;
-  paddle::platform::GpuMemoryUsage(available, capacity);
-
-  // Reserve memory for page tables, etc.
-  size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize();
-  size_t usable = available > reserving ? available - reserving : 0;
-
-  // If remaining size no less than expected size, using general
-  // cudaMalloc to allocate GPU memory.
-  void* p = 0;
-  if (size <= usable) {
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      index = 0;
-      gpu_alloc_size_ += size;
-      return p;
-    }
-  }
-
-  // If remaining size less than expected size or cudaMalloc failed,
-  // cudaMallocHost will be considered as a fallback allocator.
-  //
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
-  // of host fallback allocation. Allocates too much would reduce
-  // the amount of memory available to the underlying system for paging.
-  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
-
-  if (size > usable) return nullptr;
-
-  cudaError_t result = cudaMallocHost(&p, size);
+  void* p;
+  cudaError_t result = cudaMalloc(&p, size);
   if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
+    index = 0;
+    gpu_alloc_size_ += size;
     return p;
+  } else {
+    LOG(WARNING)
+        << "Cannot malloc " << size / 1024.0 / 1024.0
+        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
+           "environment variable to a lower value. Current value is "
+        << FLAGS_fraction_of_gpu_memory_to_use;
+    return nullptr;
   }
-
-  return nullptr;
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index 1df88a6da9fb0c50d0d7ecd083c0533d8a886a67..b46141aafd7146bd3def12d86108c10f1f143d20 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 
 #ifdef PADDLE_WITH_CUDA
 template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num,
-                                                  cudaStream_t stream) {
+void Copy<platform::CPUPlace, platform::CUDAPlace>(
+    platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 
 template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src, size_t num,
-                                                  cudaStream_t stream) {
+void Copy<platform::CUDAPlace, platform::CPUPlace>(
+    platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
 
 template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num,
-                                                  cudaStream_t stream) {
+void Copy<platform::CUDAPlace, platform::CUDAPlace>(
+    platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
   if (dst_place == src_place) {
     platform::SetDeviceId(src_place.device);
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
@@ -62,33 +56,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
   }
 }
 
-template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(src_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
-}
-
 #endif
 
 }  // namespace memory
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 9cafdfda75d0511227ef648d50a8635320a81d32..c4bb6baee7ebf2941cee5915ca2723c298689261 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }
 
 template <>
-size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
     LOG(WARNING) << "total " << total;
     LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
     LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
     platform::SetDeviceId(cur_dev);
   }
   return ptr;
 }
 
 template <>
-void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
   GetGPUBuddyAllocator(place.device)->Free(p);
 }
 
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 2444931e26774ae80b916fbb7bd46ff93025d9ed..f476bf71264da59a5c546968f4689145e1d8801b 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
 
 #ifdef PADDLE_WITH_CUDA
 
-size_t align(size_t size, paddle::platform::GPUPlace place) {
+size_t align(size_t size, paddle::platform::CUDAPlace place) {
   size += sizeof(paddle::memory::detail::Metadata);
   size_t alignment = paddle::platform::GpuMinChunkSize();
   size_t remaining = size % alignment;
@@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) {
 
   EXPECT_EQ(p, nullptr);
 
-  paddle::platform::GPUPlace gpu(0);
+  paddle::platform::CUDAPlace gpu(0);
   p = paddle::memory::Alloc(gpu, 4096);
 
   EXPECT_NE(p, nullptr);
@@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) {
 }
 
 TEST(BuddyAllocator, GPUMultAlloc) {
-  paddle::platform::GPUPlace gpu;
+  paddle::platform::CUDAPlace gpu;
 
   std::unordered_map<void *, size_t> ps;
 
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 68346001b1fa3e4ff5823f822fa41c9ef8150f2e..1386146b01dbfb1d14c76335a0f40488a69361ef 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -1,5 +1,6 @@
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
@@ -48,6 +49,10 @@ function(op_library TARGET)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
     endif()
 
+    list(LENGTH op_library_DEPS op_library_DEPS_len)
+    if (${op_library_DEPS_len} GREATER 0)
+        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+    endif()
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
@@ -212,25 +217,26 @@ set(DEPS_OPS
     send_op
     recv_op
     detection_output_op)
+if(WITH_GPU)
+    op_library(nccl_op DEPS nccl_common)
+else()
+    set(DEPS_OPS ${DEPS_OPS} nccl_op)
+endif()
 
 if(WITH_DISTRIBUTE)
-add_subdirectory(detail)
-op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    send_op.cc
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    recv_op.cc
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+    add_subdirectory(detail)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+else()
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
 endif()
 
-op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
+op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
@@ -244,21 +250,16 @@ op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
 op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
-op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
-op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
-op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
-op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
-op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
-if(WITH_GPU)
-op_library(nccl_op DEPS nccl_common)
-endif()
+op_library(lod_rank_table_op DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
+op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
-
+op_library(recurrent_op DEPS executor)
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
@@ -271,13 +272,12 @@ endforeach()
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 
-
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 if(WITH_GPU)
-  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+    cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 76da21c4726a1245241c1cf61860f9c8b62ea452..d7baa6e90538484b400f32587a052d394a8d10d5 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
@@ -63,8 +63,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
 
 class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AccuracyOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     // TODO(typhoonzero): support both inference value and indices.
     AddInput("Out", "The network output of topk (inferences)");
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index 539a93530206c93a37791a9ccb2fb104af17f940..0aadd5af41531e54b357756441f92da668d4ec01 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -26,7 +26,7 @@ template <int BlockSize>
 __global__ void AccuracyCudaKernel(const int N, const int D,
                                    const int64_t* Xdata,
                                    const int64_t* labeldata, int* correct_data,
-                                   float* accuracy) {
+                                   float* accuracy, int* total_data) {
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -47,6 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
   if (threadIdx.x == 0) {
     *correct_data = result;
     *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *total_data = N;
   }
 }
 
@@ -55,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     auto* inference = ctx.Input<Tensor>("Out");
     auto* indices = ctx.Input<Tensor>("Indices");
     auto* label = ctx.Input<Tensor>("Label");
@@ -80,22 +81,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     if (num_samples == 0) {
       return;
     }
-    platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int),
-                             cudaMemcpyHostToDevice, stream);
 
     AccuracyCudaKernel<
         PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         num_samples, infer_width, indices_data, label_data, correct_data,
-        accuracy_data);
-
-    int d_num_samples, d_num_correct;
-    float d_accuracy;
-    platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int),
-                             cudaMemcpyDeviceToHost, stream);
-    platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int),
-                             cudaMemcpyDeviceToHost, stream);
-    platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float),
-                             cudaMemcpyDeviceToHost, stream);
+        accuracy_data, total_data);
   }
 };
 
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 63490f0ec9f4852a3ead574b9d52c807d8ba6d89..4188858a90daf8b2c10eb6960393de977d467371 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/activation_op.h"
 
@@ -22,8 +22,8 @@ class ActivationOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Y");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -32,21 +32,20 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }
 };
 
 class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SigmoidOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Sigmoid operator");
-    AddOutput("Y", "Output of Sigmoid operator");
+    AddOutput("Out", "Output of Sigmoid operator");
     AddComment(R"DOC(
 Sigmoid Activation Operator
 
-$$y = \frac{1}{1 + e^{-x}}$$
+$$out = \frac{1}{1 + e^{-x}}$$
 
 )DOC");
   }
@@ -54,15 +53,14 @@ $$y = \frac{1}{1 + e^{-x}}$$
 
 class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LogSigmoidOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of LogSigmoid operator");
-    AddOutput("Y", "Output of LogSigmoid operator");
+    AddOutput("Out", "Output of LogSigmoid operator");
     AddComment(R"DOC(
 Logsigmoid Activation Operator
 
-$$y = \log \frac{1}{1 + e^{-x}}$$
+$$out = \log \frac{1}{1 + e^{-x}}$$
 
 )DOC");
   }
@@ -70,14 +68,14 @@ $$y = \log \frac{1}{1 + e^{-x}}$$
 
 class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Exp operator");
-    AddOutput("Y", "Output of Exp operator");
+    AddOutput("Out", "Output of Exp operator");
     AddComment(R"DOC(
 Exp Activation Operator.
 
-$y = e^x$
+$out = e^x$
 
 )DOC");
   }
@@ -85,14 +83,14 @@ $y = e^x$
 
 class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Relu operator");
-    AddOutput("Y", "Output of Relu operator");
+    AddOutput("Out", "Output of Relu operator");
     AddComment(R"DOC(
 Relu Activation Operator.
 
-$y = \max(x, 0)$
+$out = \max(x, 0)$
 
 )DOC");
   }
@@ -100,16 +98,15 @@ $y = \max(x, 0)$
 
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LeakyReluOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of LeakyRelu operator");
-    AddOutput("Y", "Output of LeakyRelu operator");
+    AddOutput("Out", "Output of LeakyRelu operator");
     AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
     AddComment(R"DOC(
 LeakyRelu Activation Operator.
 
-$y = \max(x, \alpha * x)$
+$out = \max(x, \alpha * x)$
 
 )DOC");
   }
@@ -117,17 +114,16 @@ $y = \max(x, \alpha * x)$
 
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Softshrink operator");
-    AddOutput("Y", "Output of Softshrink operator");
+    AddOutput("Out", "Output of Softshrink operator");
     AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
 Softshrink Activation Operator.
 
 $$
-y = \begin{cases} 
+out = \begin{cases} 
     x - \lambda, \text{if } x > \lambda \\
     x + \lambda, \text{if } x < -\lambda \\
     0,  \text{otherwise}
@@ -140,14 +136,14 @@ $$
 
 class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Tanh operator");
-    AddOutput("Y", "Output of Tanh operator");
+    AddOutput("Out", "Output of Tanh operator");
     AddComment(R"DOC(
 Tanh Activation Operator.
 
-$$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
 )DOC");
   }
@@ -155,15 +151,14 @@ $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
 class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TanhShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of TanhShrink operator");
-    AddOutput("Y", "Output of TanhShrink operator");
+    AddOutput("Out", "Output of TanhShrink operator");
     AddComment(R"DOC(
 TanhShrink Activation Operator.
 
-$$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
 )DOC");
   }
@@ -171,18 +166,17 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HardShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of HardShrink operator");
-    AddOutput("Y", "Output of HardShrink operator");
+    AddOutput("Out", "Output of HardShrink operator");
     AddAttr<float>("threshold", "The value of threshold for HardShrink")
         .SetDefault(0.5f);
     AddComment(R"DOC(
 HardShrink Activation Operator.
 
 $$
-y = \begin{cases} 
+out = \begin{cases} 
     x, \text{if } x > \lambda \\
     x, \text{if } x < -\lambda \\
     0,  \text{otherwise}
@@ -195,14 +189,14 @@ $$
 
 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Sqrt operator");
-    AddOutput("Y", "Output of Sqrt operator");
+    AddOutput("Out", "Output of Sqrt operator");
     AddComment(R"DOC(
 Sqrt Activation Operator.
 
-$y = \sqrt{x}$
+$out = \sqrt{x}$
 
 )DOC");
   }
@@ -210,14 +204,14 @@ $y = \sqrt{x}$
 
 class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Abs operator");
-    AddOutput("Y", "Output of Abs operator");
+    AddOutput("Out", "Output of Abs operator");
     AddComment(R"DOC(
 Abs Activation Operator.
 
-$y = |x|$
+$out = |x|$
 
 )DOC");
   }
@@ -225,14 +219,14 @@ $y = |x|$
 
 class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Ceil operator");
-    AddOutput("Y", "Output of Ceil operator");
+    AddOutput("Out", "Output of Ceil operator");
     AddComment(R"DOC(
 Ceil Activation Operator.
 
-$y = ceil(x)$
+$out = ceil(x)$
 
 )DOC");
   }
@@ -240,14 +234,14 @@ $y = ceil(x)$
 
 class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Floor operator");
-    AddOutput("Y", "Output of Floor operator");
+    AddOutput("Out", "Output of Floor operator");
     AddComment(R"DOC(
 Floor Activation Operator.
 
-$y = floor(x)$
+$out = floor(x)$
 
 )DOC");
   }
@@ -255,14 +249,14 @@ $y = floor(x)$
 
 class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Round operator");
-    AddOutput("Y", "Output of Round operator");
+    AddOutput("Out", "Output of Round operator");
     AddComment(R"DOC(
 Round Activation Operator.
 
-$y = [x]$
+$out = [x]$
 
 )DOC");
   }
@@ -270,15 +264,14 @@ $y = [x]$
 
 class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReciprocalOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Reciprocal operator");
-    AddOutput("Y", "Output of Reciprocal operator");
+    AddOutput("Out", "Output of Reciprocal operator");
     AddComment(R"DOC(
 Reciprocal Activation Operator.
 
-$$y = \frac{1}{x}$$
+$$out = \frac{1}{x}$$
 
 )DOC");
   }
@@ -286,14 +279,14 @@ $$y = \frac{1}{x}$$
 
 class LogOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LogOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Log operator");
-    AddOutput("Y", "Output of Log operator");
+    AddOutput("Out", "Output of Log operator");
     AddComment(R"DOC(
 Log Activation Operator.
 
-$y = \ln(x)$
+$out = \ln(x)$
 
 Natural logarithm of x.
 
@@ -303,14 +296,14 @@ Natural logarithm of x.
 
 class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Square operator");
-    AddOutput("Y", "Output of Square operator");
+    AddOutput("Out", "Output of Square operator");
     AddComment(R"DOC(
 Square Activation Operator.
 
-$y = x^2$
+$out = x^2$
 
 )DOC");
   }
@@ -318,15 +311,14 @@ $y = x^2$
 
 class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftplusOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Softplus operator");
-    AddOutput("Y", "Output of Softplus operator");
+    AddOutput("Out", "Output of Softplus operator");
     AddComment(R"DOC(
 Softplus Activation Operator.
 
-$y = \ln(1 + e^{x})$
+$out = \ln(1 + e^{x})$
 
 )DOC");
   }
@@ -334,15 +326,14 @@ $y = \ln(1 + e^{x})$
 
 class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftsignOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Softsign operator");
-    AddOutput("Y", "Output of Softsign operator");
+    AddOutput("Out", "Output of Softsign operator");
     AddComment(R"DOC(
 Softsign Activation Operator.
 
-$$y = \frac{x}{1 + |x|}$$
+$$out = \frac{x}{1 + |x|}$$
 
 )DOC");
   }
@@ -350,10 +341,10 @@ $$y = \frac{x}{1 + |x|}$$
 
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of BRelu operator");
-    AddOutput("Y", "Output of BRelu operator");
+    AddOutput("Out", "Output of BRelu operator");
     AddAttr<float>("t_min", "The min marginal value of BRelu")
         .SetDefault(static_cast<float>(0));
     AddAttr<float>("t_max", "The max marginal value of BRelu")
@@ -361,7 +352,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 BRelu Activation Operator.
 
-$y = \max(\min(x, t_{min}), t_{max})$
+$out = \max(\min(x, t_{min}), t_{max})$
 
 )DOC");
   }
@@ -369,17 +360,16 @@ $y = \max(\min(x, t_{min}), t_{max})$
 
 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftReluOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of SoftRelu operator");
-    AddOutput("Y", "Output of SoftRelu operator");
+    AddOutput("Out", "Output of SoftRelu operator");
     AddAttr<float>("threshold", "The threshold value of SoftRelu")
         .SetDefault(40.0f);
     AddComment(R"DOC(
 SoftRelu Activation Operator.
 
-$y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
+$out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
 )DOC");
   }
@@ -387,10 +377,10 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of ELU operator");
-    AddOutput("Y", "Output of ELU operator");
+    AddOutput("Out", "Output of ELU operator");
     AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
     AddComment(R"DOC(
 ELU Activation Operator.
@@ -398,7 +388,7 @@ ELU Activation Operator.
 Applies the following element-wise computation on the input according to
 https://arxiv.org/abs/1511.07289.
 
-$y = \max(0, x) + \min(0, \alpha * (e^x - 1))$
+$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$
 
 )DOC");
   }
@@ -406,16 +396,16 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$
 
 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Relu6 operator");
-    AddOutput("Y", "Output of Relu6 operator");
+    AddOutput("Out", "Output of Relu6 operator");
     AddAttr<float>("threshold", "The threshold value of Relu6")
         .SetDefault(6.0f);
     AddComment(R"DOC(
 Relu6 Activation Operator.
 
-$y = \min(\max(0, x), 6)$
+$out = \min(\max(0, x), 6)$
 
 )DOC");
   }
@@ -423,15 +413,15 @@ $y = \min(\max(0, x), 6)$
 
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  PowOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Pow operator");
-    AddOutput("Y", "Output of Pow operator");
+    AddOutput("Out", "Output of Pow operator");
     AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
     AddComment(R"DOC(
 Pow Activation Operator.
 
-$y = x^{factor}$
+$out = x^{factor}$
 
 )DOC");
   }
@@ -439,10 +429,10 @@ $y = x^{factor}$
 
 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of STanh operator");
-    AddOutput("Y", "Output of STanh operator");
+    AddOutput("Out", "Output of STanh operator");
     AddAttr<float>("scale_a", "The scale parameter of a for the input")
         .SetDefault(2.0f / 3.0f);
     AddAttr<float>("scale_b", "The scale parameter of b for the input")
@@ -450,7 +440,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 STanh Activation Operator.
 
-$$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
+$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
 )DOC");
   }
@@ -458,18 +448,17 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ThresholdedReluOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of ThresholdedRelu operator");
-    AddOutput("Y", "Output of ThresholdedRelu operator");
+    AddOutput("Out", "Output of ThresholdedRelu operator");
     AddAttr<float>("threshold", "The threshold location of activation")
         .SetDefault(1.0f);
     AddComment(R"DOC(
 ThresholdedRelu Activation Operator.
 
 $$
-y = \begin{cases} 
+out = \begin{cases} 
     x, \text{if } x > threshold \\
     0,  \text{otherwise}
     \end{cases}
@@ -481,11 +470,10 @@ $$
 
 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HardSigmoidOpMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of HardSigmoid operator");
-    AddOutput("Y", "Output of HardSigmoid operator");
+    AddOutput("Out", "Output of HardSigmoid operator");
     AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
         .SetDefault(0.2f);
     AddAttr<float>("offset", "Offset for linear approximation of sigmoid")
@@ -496,7 +484,7 @@ HardSigmoid Activation Operator.
 Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), 
 which is much faster than sigmoid.
 
-$y = \max(0, \min(1, slope * x + shift))$
+$out = \max(0, \min(1, slope * x + shift))$
 
 The slope should be positive. The offset can be either positive or negative.
 The default slope and shift are set according to the above reference.
@@ -508,15 +496,15 @@ It is recommended to use the defaults for this activation.
 
 class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Swish operator");
-    AddOutput("Y", "Output of Swish operator");
+    AddOutput("Out", "Output of Swish operator");
     AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
     AddComment(R"DOC(
 Swish Activation Operator.
 
-$$y = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \frac{x}{1 + e^{- \beta x}}$$
 
 )DOC");
   }
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index 856d3fc35dafe6b22c25c55dfda2dc4973072615..b9ccdf639cf4a9ea80d530e550c16089e50c44e0 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/activation_op.h"
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 75eefca8b8c7ba8831a2f90c83718d00b83fba30..0885f7c570b9b52dc51597347295734fd689da8d 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -27,11 +27,11 @@ class ActivationKernel
 
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    Y->mutable_data<T>(context.GetPlace());
+    auto* Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
@@ -40,7 +40,7 @@ class ActivationKernel
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    functor(*place, x, y);
+    functor(*place, x, out);
   }
 };
 
@@ -51,14 +51,15 @@ class ActivationGradKernel
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Input<framework::Tensor>("Y");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* Out = context.Input<framework::Tensor>("Out");
+    auto* dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
     dX->mutable_data<T>(context.GetPlace());
 
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
@@ -67,7 +68,7 @@ class ActivationGradKernel
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    functor(*place, x, y, dy, dx);
+    functor(*place, x, out, dout, dx);
   }
 };
 
@@ -83,17 +84,18 @@ struct BaseActivationFunctor {
 // sigmoid(x) = 1 / (1 + exp(-x))
 template <typename T>
 struct SigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
   }
 };
 
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * y * (static_cast<T>(1) - y);
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out * (static_cast<T>(1) - out);
   }
 };
 
@@ -101,7 +103,7 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
 // For numerical stability, we can use the log-sum-exp trick:
 // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
 // We can rewrite the above equation as:
-// y = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
 //   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
 //   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x -
 //           max(-x, 0)))
@@ -112,10 +114,10 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
 // + exp(-x - max(-x, 0))))
 template <typename T>
 struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
     auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    y.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
   }
 };
 
@@ -124,62 +126,66 @@ struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
 // exp(-x - max(-x, 0)))
 template <typename T>
 struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
     dx.device(d) =
-        dy * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
   }
 };
 
 // exp(x) = e^x
 template <typename T>
 struct ExpFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.exp();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.exp();
   }
 };
 
 template <typename T>
 struct ExpGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * y;
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out;
   }
 };
 
 // relu(x) = max(x, 0)
 template <typename T>
 struct ReluFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.cwiseMax(static_cast<T>(0));
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(0));
   }
 };
 
 template <typename T>
 struct ReluGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>();
   }
 };
 
 // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
 struct TanhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.tanh();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.tanh();
   }
 };
 
 template <typename T>
 struct TanhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * (static_cast<T>(1) - y * y);
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) - out * out);
   }
 };
 
@@ -187,17 +193,18 @@ struct TanhGradFunctor : public BaseActivationFunctor<T> {
 // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
 struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x - x.tanh();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x - x.tanh();
   }
 };
 
 template <typename T>
 struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * (x.tanh() * x.tanh());
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x.tanh() * x.tanh());
   }
 };
 
@@ -210,11 +217,11 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
     auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
     auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
-    y.device(d) = x * (temp1 + temp2);
+    out.device(d) = x * (temp1 + temp2);
   }
 };
 
@@ -226,11 +233,12 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
     return {{"threshold", &threshold}};
   }
 
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
     auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
-    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 };
 
@@ -243,12 +251,12 @@ struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
     return {{"lambda", &lambda}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
     auto lambdaT = static_cast<T>(lambda);
     auto temp1 = (x > lambdaT).template cast<T>().eval();
     auto temp2 = (x < -lambdaT).template cast<T>().eval();
-    y.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
+    out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
   }
 };
 
@@ -258,46 +266,49 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"lambda", &lambda}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto lambdaT = static_cast<T>(lambda);
     auto temp1 = (x > lambdaT).template cast<T>().eval();
     auto temp2 = (x < -lambdaT).template cast<T>().eval();
-    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 };
 
 // sqrt(x) = x^(1/2)
 template <typename T>
 struct SqrtFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.sqrt();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.sqrt();
   }
 };
 
 template <typename T>
 struct SqrtGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    const Y y_conj = Eigen::numext::conj(y);
-    dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
   }
 };
 
 // ceil(x) = ceiling(x)
 template <typename T>
 struct CeilFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.ceil();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.ceil();
   }
 };
 
 template <typename T>
 struct ZeroGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = static_cast<T>(0) / x;
   }
 };
@@ -305,86 +316,90 @@ struct ZeroGradFunctor : public BaseActivationFunctor<T> {
 // floor(x) = flooring(x)
 template <typename T>
 struct FloorFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.ceil();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.ceil();
   }
 };
 
 // round(x) = [x]
 template <typename T>
 struct RoundFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.round();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.round();
   }
 };
 
 // abs(x) = |x|
 template <typename T>
 struct AbsFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.abs();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.abs();
   }
 };
 
 template <typename T>
 struct AbsGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * x.sign();
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.sign();
   }
 };
 
 // reciprocal(x) = 1 / x
 template <typename T>
 struct ReciprocalFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = static_cast<T>(1) / x;
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / x;
   }
 };
 
 template <typename T>
 struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * static_cast<T>(-1) * y * y;
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(-1) * out * out;
   }
 };
 
 // log(x) = natural logarithm of x
 template <typename T>
 struct LogFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.log();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.log();
   }
 };
 
 template <typename T>
 struct LogGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * (static_cast<T>(1) / x);
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) / x);
   }
 };
 
 // square(x) = x^2
 template <typename T>
 struct SquareFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.square();
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.square();
   }
 };
 
 template <typename T>
 struct SquareGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * static_cast<T>(2) * x;
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(2) * x;
   }
 };
 
@@ -399,9 +414,9 @@ struct BReluFunctor : public BaseActivationFunctor<T> {
     return {{"t_min", &t_min}, {"t_max", &t_max}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) =
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
         x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
   }
 };
@@ -413,9 +428,10 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"t_min", &t_min}, {"t_max", &t_max}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy *
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
                    ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
                        .template cast<T>();
   }
@@ -430,9 +446,9 @@ struct Relu6Functor : public BaseActivationFunctor<T> {
     return {{"threshold", &threshold}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) =
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
         x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
   }
 };
@@ -443,9 +459,10 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy *
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
                    ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
                        .template cast<T>();
   }
@@ -458,10 +475,10 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
 // Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
 template <typename T>
 struct SoftplusFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) {
     auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
-    y.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
+    out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
   }
 };
 
@@ -471,19 +488,21 @@ struct SoftplusFunctor : public BaseActivationFunctor<T> {
 // exp(x - max(x, 0)))
 template <typename T>
 struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
     auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
-    dx.device(d) = dy * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
+    dx.device(d) =
+        dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
   }
 };
 
 // softsign(x) = x / (1 + |x|)
 template <typename T>
 struct SoftsignFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
-    y.device(d) = x / (static_cast<T>(1) + x.abs());
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) {
+    out.device(d) = x / (static_cast<T>(1) + x.abs());
   }
 };
 
@@ -491,10 +510,11 @@ struct SoftsignFunctor : public BaseActivationFunctor<T> {
 // Taken from https://en.wikipedia.org/wiki/Activation_function
 template <typename T>
 struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
     dx.device(d) =
-        dy * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
+        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
   }
 };
 
@@ -505,11 +525,11 @@ struct SoftReluFunctor : public BaseActivationFunctor<T> {
     return {{"threshold", &threshold}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
     auto tmp = static_cast<T>(threshold);
     auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
-    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
+    out.device(d) = (static_cast<T>(1) + temp.exp()).log();
   }
 };
 
@@ -519,11 +539,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto tmp = static_cast<T>(threshold);
     auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
-    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
+    dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
   }
 };
 
@@ -534,9 +555,9 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
     return {{"alpha", &alpha}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
   }
 };
 
@@ -546,12 +567,13 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"alpha", &alpha}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = static_cast<T>(alpha) *
                  (x < static_cast<T>(0)).template cast<T>().eval();
     auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
-    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 };
 
@@ -562,11 +584,11 @@ struct ELUFunctor : public BaseActivationFunctor<T> {
     return {{"alpha", &alpha}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.cwiseMax(static_cast<T>(0)) +
-                  (static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
-                      .cwiseMin(static_cast<T>(0));
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(0)) +
+                    (static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
+                        .cwiseMin(static_cast<T>(0));
   }
 };
 
@@ -576,10 +598,11 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"alpha", &alpha}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>() +
-                   dy * (y + static_cast<T>(alpha)) *
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
+                   dout * (out + static_cast<T>(alpha)) *
                        (x < static_cast<T>(0)).template cast<T>();
   }
 };
@@ -591,9 +614,9 @@ struct PowFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"factor", &factor}};
   }
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.pow(static_cast<T>(factor));
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.pow(static_cast<T>(factor));
   }
 };
 
@@ -603,9 +626,10 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"factor", &factor}};
   }
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * static_cast<T>(factor) *
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(factor) *
                    x.pow(static_cast<T>(factor - static_cast<T>(1)));
   }
 };
@@ -618,9 +642,9 @@ struct STanhFunctor : public BaseActivationFunctor<T> {
     return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) =
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
         static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
   }
 };
@@ -633,12 +657,13 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
     return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
   }
 
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto a = static_cast<T>(scale_a);
     auto b = static_cast<T>(scale_b);
     auto temp = (a * x).tanh() * (a * x).tanh();
-    dx.device(d) = dy * a * b * (static_cast<T>(1) - temp);
+    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
   }
 };
 
@@ -649,10 +674,10 @@ struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
     return {{"threshold", &threshold}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
     auto th = static_cast<T>(threshold);
-    y.device(d) = (x > th).template cast<T>() * x;
+    out.device(d) = (x > th).template cast<T>() * x;
   }
 };
 
@@ -663,10 +688,11 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
     return {{"threshold", &threshold}};
   }
 
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto th = static_cast<T>(threshold);
-    dx.device(d) = dy * (x > th).template cast<T>();
+    dx.device(d) = dout * (x > th).template cast<T>();
   }
 };
 
@@ -678,10 +704,11 @@ struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
     return {{"slope", &slope}, {"offset", &offset}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
     auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    y.device(d) = temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
+    out.device(d) =
+        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
   }
 };
 
@@ -693,12 +720,13 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
     return {{"slope", &slope}, {"offset", &offset}};
   }
 
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) =
-        dy *
-        ((y > static_cast<T>(0)) * (y < static_cast<T>(1))).template cast<T>() *
-        static_cast<T>(slope);
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
+                       .template cast<T>() *
+                   static_cast<T>(slope);
   }
 };
 
@@ -709,9 +737,9 @@ struct SwishFunctor : public BaseActivationFunctor<T> {
     return {{"beta", &beta}};
   }
 
-  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
   }
 };
 
@@ -722,12 +750,13 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
     return {{"beta", &beta}};
   }
 
-  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = static_cast<T>(1) /
                  (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (beta * y));
-    dx.device(d) = dy * ((beta * y) + temp2);
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
+    dx.device(d) = dout * ((beta * out) + temp2);
   }
 };
 
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index 507811e7b59b9426c599570ead9b42f8d02380fd..d8a9491c8247ac463e01606dac248780d5284236 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -59,8 +59,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
 
 class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdadeltaOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
index eee2d0a2f55f877bc5c87c72bca07bfd9485e517..91294a0d5d148a43bb95ab83ae8176b475fde9de 100644
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/adadelta_op.h"
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index 5d007163161cd4bf4a9fd46eda57f7984c6a414f..052c793a01907abdc7784d1290f43543ae81bdb1 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -59,8 +59,7 @@ class AdagradOp : public framework::OperatorWithKernel {
 
 class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdagradOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 585b2d92894af65b8ed15a596f0377fdcf564cfa..75bc7affd6c78beb783e01682b4538f2c259df26 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/adagrad_op.h"
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index cf6ef6dd53979b23de125014b8d5150d8ce4c053..03527de936bf736d572fb0140033bde4db990981 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -73,7 +73,7 @@ class AdamOp : public framework::OperatorWithKernel {
 
 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu
index c135b3737899a1ae92041b4759698ddc30c20e12..94f840c188942a900858429bc621c3a18d5900ad 100644
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/adam_op.h"
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 45157842a6f92348909498f83d304d53b36c7d47..c4e2c8bb88ec9c74bd782570c10fb217178c8e48 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -13,59 +13,113 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
+#include <math.h>  // for sqrt in CPU and CUDA
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
+template <typename T>
+struct AdamFunctor {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+              const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
+              T* mom2_out, const T* lr, const T* grad, const T* param,
+              T* param_out)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out) {}
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    // Merge all memory access together.
+    T g = grad_[i];
+    T mom1 = moment1_[i];
+    T mom2 = moment2_[i];
+    T lr = *lr_;
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    T p = param_[i];
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = p;
+  }
+};
+
 template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto moment1_out_tensor = ctx.Output<framework::Tensor>("Moment1Out");
-    auto moment2_out_tensor = ctx.Output<framework::Tensor>("Moment2Out");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
-    moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
+    using paddle::framework::LoDTensor;
+    using paddle::operators::detail::Ref;
 
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
+    auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+    auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
+    auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
+    auto& lr =
+        Ref(ctx.Input<LoDTensor>("LearningRate"), "Must set LearningRate");
+
+    auto& beta1_pow =
+        Ref(ctx.Input<LoDTensor>("Beta1Pow"), "Must set Beta1Pow");
+    auto& beta2_pow =
+        Ref(ctx.Input<LoDTensor>("Beta2Pow"), "Must set Beta2Pow");
+
+    auto& param_out =
+        Ref(ctx.Output<LoDTensor>("ParamOut"), "Must set ParamOut");
+    auto& mom1_out =
+        Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out");
+    auto& mom2_out =
+        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
 
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    auto moment1 = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Moment1"));
-    auto moment2 = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Moment2"));
-    auto lr = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("LearningRate"));
-    auto beta1_pow = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Beta1Pow"));
-    auto beta2_pow = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Beta2Pow"));
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
-    auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad;
-    moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square();
-
-    // All of these are tensors of 1 element
-    auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
-    // Eigen does not support automatic broadcast
-    // Get dimensions of moment vector to broadcast lr_t
-    Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
-    param_out.device(*place) =
-        param -
-        lr_t.broadcast(m_dsize) *
-            (moment1_out / (moment2_out.sqrt() + epsilon));
+    AdamFunctor<T> functor(beta1, beta2, epsilon, beta1_pow.template data<T>(),
+                           beta2_pow.template data<T>(),
+                           mom1.template data<T>(),
+                           mom1_out.template mutable_data<T>(ctx.GetPlace()),
+                           mom2.template data<T>(),
+                           mom2_out.template mutable_data<T>(ctx.GetPlace()),
+                           lr.template data<T>(), grad.template data<T>(),
+                           param.template data<T>(),
+                           param_out.template mutable_data<T>(ctx.GetPlace()));
+    platform::ForRange<DeviceContext> for_range(
+        static_cast<const DeviceContext&>(ctx.device_context()), param.numel());
+    for_range(functor);
   }
 };
 
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index 49ce497bb710de24b198fb4b5f56ff6d277c6f52..3b0b71418477ea128dbb31a8d7cd44cf6bf023a1 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -67,7 +67,7 @@ class AdamaxOp : public framework::OperatorWithKernel {
 
 class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu
index 2d143905c4819dbf5f94391bdcf093971849e7a3..8f87bb28671018a184f25a014f9bdb7615f3040c 100644
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/adamax_op.h"
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 1f2b4fdb4b4a99d5baf5de1cc226dc196ab4eb2e..e0eef5d9f93d70930ee82d663de9610cc0176e33 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -1,20 +1,21 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase {
 
  protected:
   size_t GetOffset(const framework::Scope &scope,
-                   const platform::DeviceContext &dev_ctx) const {
+                   const platform::Place &place) const {
     auto *i = scope.FindVar(Input("I"));
     PADDLE_ENFORCE(i != nullptr, "I must be set");
     auto &i_tensor = i->Get<framework::LoDTensor>();
     PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     size_t offset;
     if (platform::is_gpu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index faeba7f3ed26d05de16775a1de4d42f802111207..49366fee8df5a44a97b7b4e87cbf0b7c813a414a 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -1,21 +1,23 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <numeric>
+
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
@@ -103,6 +105,11 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
           continue;
         }
         auto slice = out->Slice(out_offset, out_offset + len);
+
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
+
         framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                             dev_ctx, &slice);
         out_offset += len;
@@ -114,8 +121,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
 
 class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(std::vector<LodTensor>) A vector of tensors that is going to "
@@ -150,14 +156,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("lod_tensor_to_array");
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetInput("RankTable", Input("RankTable"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
index 0a37f18729a93b15623c0a17e3689e518c38b844..7d77be3be1034bb38f6c92c181aa525214073eec 100644
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -1,20 +1,21 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/var_type.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase {
            const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) {
       return;
@@ -80,14 +81,17 @@ class AssignOp : public framework::OperatorBase {
     PADDLE_ENFORCE(
         out != nullptr,
         "The Output(Out) should not be null if the Input(X) is set.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
   }
 };
 
 class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AssignOpProtoMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
+  AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
@@ -109,8 +113,8 @@ class AssignInferShape : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *context) const override {
     if (context->HasInput("X")) {
       auto type = context->GetInputsVarType("X")[0];
-      if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
-          type == framework::VarDesc_VarType_LOD_TENSOR) {
+      if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS ||
+          type == framework::proto::VarDesc_VarType_LOD_TENSOR) {
         context->SetOutputDim("Out", context->GetInputDim("X"));
       }
     }
@@ -122,12 +126,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
     op->SetType("assign");
     op->SetInput("X", OutputGrad("Out"));
     op->SetOutput("Out", InputGrad("X"));
-    return std::unique_ptr<framework::OpDescBind>(op);
+    return std::unique_ptr<framework::OpDesc>(op);
   }
 };
 
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
index 6c3f67ec32fb1b942241997e87a1e9c4752e707d..c16bc11931e6733d567107913521eafc34a30066 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
@@ -49,7 +49,7 @@ class AucOp : public framework::OperatorWithKernel {
 
 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AucOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 94a972b7ab56f41f8b6a203b6bf0330a69f84e54..98db28ddee7c6cdb37fe7732649d4fc38de7b873 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/batch_norm_op.h"
+#include "paddle/framework/data_layout.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
 
 template <typename T>
 using EigenArrayMap =
@@ -48,10 +50,6 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
     PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
 
-    const float epsilon = ctx->Attrs().Get<float>("epsilon");
-    PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0");
-    PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");
-
     // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
     PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                       "Mean and MeanOut should share the same memory");
@@ -60,15 +58,15 @@ class BatchNormOp : public framework::OperatorWithKernel {
                       "Variance and VarianceOut should share the same memory");
 
     const auto x_dims = ctx->GetInputDim("X");
-    const TensorFormat tensor_format =
-        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    const DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
 
     PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                    "Input X must have 2 to 5 dimensions.");
 
     const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
-                                             : x_dims[x_dims.size() - 1]);
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
 
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
@@ -85,13 +83,17 @@ class BatchNormOp : public framework::OperatorWithKernel {
 
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BatchNormOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<float>("momentum", "").SetDefault(0.9);
-    AddAttr<float>("epsilon", "").SetDefault(1e-5);
-    AddAttr<std::string>("tensor_format", "").SetDefault("NCHW");
+    AddAttr<float>("epsilon", "")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
     AddInput("X", "The input tensor");
     AddInput("Scale",
              "Scale is a 1-dimensional tensor of size C "
@@ -142,9 +144,9 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     const float epsilon = ctx.Attr<float>("epsilon");
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
@@ -152,8 +154,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
                    "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
-                                             : x_dims[x_dims.size() - 1]);
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
     const int sample_size = x->numel() / N / C;
 
     auto *y = ctx.Output<Tensor>("Y");
@@ -178,8 +180,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
       saved_mean_e.setZero();
       saved_variance_e.setZero();
 
-      switch (tensor_format) {
-        case TensorFormat::NCHW: {
+      switch (data_layout) {
+        case DataLayout::kNCHW: {
           ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
           for (int nc = 0; nc < N * C; ++nc) {
             saved_mean_e(nc % C) += x_arr.col(nc).sum();
@@ -192,7 +194,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
           saved_variance_e /= N * sample_size;
           break;
         }
-        case TensorFormat::NHWC: {
+        case DataLayout::kNHWC: {
           ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
           for (int i = 0; i < N * sample_size; ++i) {
             saved_mean_e += x_arr.col(i);
@@ -206,7 +208,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
           break;
         }
         default:
-          PADDLE_THROW("Unknown storage order: %s", tensor_format_str);
+          PADDLE_THROW("Unknown storage order: %s", data_layout_str);
       }
 
       EigenVectorArrayMap<T> running_mean_arr(
@@ -248,8 +250,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
         bias_arr - mean_arr * inv_std * scale_arr;
 
-    switch (tensor_format) {
-      case TensorFormat::NCHW: {
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
         EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
                                N * C);
         ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
@@ -258,7 +260,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
         }
         break;
       }
-      case TensorFormat::NHWC: {
+      case DataLayout::kNHWC: {
         EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
                          N * sample_size) =
             (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
@@ -268,7 +270,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
         break;
       }
       default:
-        PADDLE_THROW("Unknown storage order: %d", tensor_format);
+        PADDLE_THROW("Unknown storage order: %d", data_layout);
     }
   }
 };
@@ -291,11 +293,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
 
     const auto x_dims = ctx->GetInputDim("X");
-    const TensorFormat tensor_format =
-        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    const DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
     const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
-                                             : x_dims[x_dims.size() - 1]);
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
 
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
@@ -303,7 +305,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     const auto *var = ctx.InputVar(framework::GradVarName("Y"));
     if (var == nullptr) {
@@ -334,9 +336,9 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     // SavedVariance have been reverted in forward operator
     const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
 
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
@@ -345,8 +347,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
                    "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
-                                             : x_dims[x_dims.size() - 1]);
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
     const int sample_size = x->numel() / N / C;
 
     ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
@@ -377,8 +379,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
 
     const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
 
-    switch (tensor_format) {
-      case TensorFormat::NCHW: {
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
         ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
         ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
         EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
@@ -401,7 +403,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
         }
         break;
       }
-      case TensorFormat::NHWC: {
+      case DataLayout::kNHWC: {
         ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
         ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
         EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
@@ -426,7 +428,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
         break;
       }
       default:
-        PADDLE_THROW("Unknown storage order: %s", tensor_format_str);
+        PADDLE_THROW("Unknown storage order: %s", data_layout_str);
     }
   }
 };
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
index c7adc3d80ed25d129cec41a0fd3d22fd42aba363..3d17725ab47682355b2093782848849857f9bf59 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/batch_norm_op.h"
+#include "paddle/framework/data_layout.h"
 
 #include <cfloat>
 #include "paddle/operators/math/math_function.h"
@@ -22,12 +23,12 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;
 
-void ExtractNCWHD(const framework::DDim &dims,
-                  const TensorFormat &tensor_format, int *N, int *C, int *H,
-                  int *W, int *D) {
+void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
+                  int *N, int *C, int *H, int *W, int *D) {
   *N = dims[0];
   if (dims.size() == 2) {
     *C = dims[1];
@@ -35,13 +36,13 @@ void ExtractNCWHD(const framework::DDim &dims,
     *W = 1;
     *D = 1;
   } else {
-    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
-    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
     *W = dims.size() > 3
-             ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
              : 1;
     *D = dims.size() > 4
-             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
              : 1;
   }
 }
@@ -52,13 +53,13 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
 
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
@@ -67,7 +68,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                    "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
-    ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
 
     // ------------------- cudnn descriptors ---------------------
     cudnnTensorDescriptor_t data_desc_;
@@ -93,7 +94,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     VLOG(1) << "Setting descriptors.";
     std::vector<int> dims;
     std::vector<int> strides;
-    if (tensor_format == TensorFormat::NCHW) {
+    if (data_layout == DataLayout::kNCHW) {
       dims = {N, C, H, W, D};
       strides = {C * H * W * D, H * W * D, W * D, D, 1};
     } else {
@@ -178,11 +179,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
     const auto *x = ctx.Input<Tensor>("X");
     const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
     const auto *scale = ctx.Input<Tensor>("Scale");
@@ -192,7 +193,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                    "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
-    ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
 
     PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
     PADDLE_ENFORCE_EQ(scale->dims()[0], C);
@@ -219,7 +220,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 
     std::vector<int> dims;
     std::vector<int> strides;
-    if (tensor_format == TensorFormat::NCHW) {
+    if (data_layout == DataLayout::kNCHW) {
       dims = {N, C, H, W, D};
       strides = {C * H * W * D, H * W * D, W * D, D, 1};
     } else {
diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h
index 8d99b6864776e81b30e87c09028b336309cf2838..a817ef41fc87da33ad87923c99a75ee7c3c7bbfe 100644
--- a/paddle/operators/batch_norm_op.h
+++ b/paddle/operators/batch_norm_op.h
@@ -19,21 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-enum TensorFormat {
-  NHWC = 0,
-  NCHW = 1,
-};
-
-inline TensorFormat StringToTensorFormat(const std::string& str) {
-  if (str == "NHWC" || str == "nhwc") {
-    return TensorFormat::NHWC;
-  } else if (str == "NCHW" || str == "nchw") {
-    return TensorFormat::NCHW;
-  } else {
-    PADDLE_THROW("Unknown storage order string: %s", str);
-  }
-}
-
 template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
index c796a0c5d089499e7858c7a427825fdbeb05cb7f..72e05607b0b612807d552b4c45b58f9d9ce9c2af 100644
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/beam_search_decode_op.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                      const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);
+
     framework::ExecutionContext ctx(*this, scope, dev_ctx);
 
     const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
@@ -83,9 +87,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
 
 class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
-                               framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Ids",
              "(LodTensorArray)"
              "score of the candidate words in each step");
@@ -120,13 +123,13 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
 
 class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind& op_desc,
-                  framework::BlockDescBind* block) const override {
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
     for (auto& o : op_desc.Output("SentenceIds")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
     }
     for (auto& o : op_desc.Output("SentenceScores")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc
index 8c3e2a303fb8f12a8886c11cf112b859a6db7bcf..2e0513b37a24b9737532b3a71f8f0724fbdd2c13 100644
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/beam_search_op.h"
 
@@ -153,8 +153,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
 class BeamSearchProtoAndCheckerMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchProtoAndCheckerMaker(framework::OpProto *proto,
-                                 framework::OpAttrChecker *op_checker)
+  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     // inputs and outputs stored in proto
     AddInput("pre_ids", "ids in previous step");
diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h
index cc556bfe42ab12d73c0eb503d033efc272b5dd68..08b551ef9bd63106ed222d3a956a912294f827ec 100644
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase {
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
     LOG(INFO) << "run beam search op";
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
index 217fd523667777f7d250295d2a036867dac94f04..7640147a12d66a924f16eaf168227b6ce6a96040 100644
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {
 
 class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BilinearTensorProductOpMaker(framework::OpProto* proto,
-                               framework::OpAttrChecker* op_checker)
+  BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of bilinear_tensor_product operator.");
     AddInput("Y", "The second input of bilinear_tensor_product operator.");
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index d641b8fc9fea81d1e364ae05de98ed7760a32648..446976edafca56f3c56fe573c8b5ef76a333089f 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/cast_op.h"
 #include "paddle/framework/op_registry.h"
@@ -20,8 +20,7 @@ namespace operators {
 
 class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CastOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of cast op");
     AddOutput("Out", "The output tensor of cast op");
@@ -53,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto grad = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad = new framework::OpDesc();
     grad->SetType("cast");
     grad->SetInput("X", OutputGrad("Out"));
     grad->SetOutput("Out", InputGrad("X"));
     grad->SetAttr("out_dtype", GetAttr("in_dtype"));
     grad->SetAttr("in_dtype", GetAttr("out_dtype"));
-    return std::unique_ptr<framework::OpDescBind>(grad);
+    return std::unique_ptr<framework::OpDesc>(grad);
   }
 };
 
diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu
index 91e6fb391c637cc0d70a401d8d834451059ef6df..d68bbe6e39a2fbaa92787731145ae324288b981a 100644
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/cast_op.h"
 
diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h
index a6773f13a8deb443b022c6045f1b3b976b3e6607..9f39d91edd49d236d74019ca81b42002e4f35d36 100644
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -55,7 +55,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
     auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     framework::VisitDataType(
-        static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
+        static_cast<framework::proto::DataType>(context.Attr<int>("out_dtype")),
         CastOpFunctor<DeviceContext, InT>(
             in, out, context.template device_context<DeviceContext>()));
   }
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
index 94127ab33e51d5529b63b5e3696032ef8adcf03e..a040404266c3cd44230b141cfed1aaede3f05187 100644
--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
@@ -32,6 +32,13 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
                    "Output(Recall) of ChunkEvalOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
                    "Output(F1-Score) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"),
+                   "Output(NumInferChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"),
+                   "Output(NumLabelChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NumCorrectChunks"),
+        "Output(NumCorrectChunks) of ChunkEvalOp should not be null.");
 
     auto inference_dim = ctx->GetInputDim("Inference");
     auto label_dim = ctx->GetInputDim("Label");
@@ -42,20 +49,22 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Precision", {1});
     ctx->SetOutputDim("Recall", {1});
     ctx->SetOutputDim("F1-Score", {1});
+    ctx->SetOutputDim("NumInferChunks", {1});
+    ctx->SetOutputDim("NumLabelChunks", {1});
+    ctx->SetOutputDim("NumCorrectChunks", {1});
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(framework::DataType::FP32,
+    return framework::OpKernelType(framework::proto::DataType::FP32,
                                    ctx.device_context());
   }
 };
 
 class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChunkEvalOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Inference",
              "(Tensor, default: Tensor<int64_t>). "
@@ -70,6 +79,16 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
               "sensitivity) of chunks on the given mini-batch.");
     AddOutput("F1-Score",
               "(float). The evaluated F1-Score on the given mini-batch.");
+    AddOutput("NumInferChunks",
+              "(int64_t). The number of chunks in Inference on the given "
+              "mini-batch.");
+    AddOutput(
+        "NumLabelChunks",
+        "(int64_t). The number of chunks in Label on the given mini-batch.");
+    AddOutput(
+        "NumCorrectChunks",
+        "(int64_t). The number of chunks both in Inference and Label on the "
+        "given mini-batch.");
     AddAttr<int>("num_chunk_types",
                  "(int). The number of chunk type. See below for details.");
     AddAttr<std::string>(
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
index 9cd758a8253914515437b480e17a94d5d6b21fd2..74ab435c860b22b2ee3f485743540976a7a31b96 100644
--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/operators/chunk_eval_op.h
@@ -111,9 +111,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     std::vector<Segment> label_segments;
     std::vector<Segment> output_segments;
     std::set<int> excluded_chunk_types;
-    int64_t num_output_segments = 0;
-    int64_t num_label_segments = 0;
-    int64_t num_correct = 0;
+
     if (context.Attr<std::string>("chunk_scheme") == "IOB") {
       num_tag_types = 2;
       tag_begin = 0;
@@ -151,12 +149,24 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     auto* precision = context.Output<Tensor>("Precision");
     auto* recall = context.Output<Tensor>("Recall");
     auto* f1 = context.Output<Tensor>("F1-Score");
+    auto* num_infer_chunks = context.Output<Tensor>("NumInferChunks");
+    auto* num_label_chunks = context.Output<Tensor>("NumLabelChunks");
+    auto* num_correct_chunks = context.Output<Tensor>("NumCorrectChunks");
 
     const int64_t* inference_data = inference->data<int64_t>();
     const int64_t* label_data = label->data<int64_t>();
     T* precision_data = precision->mutable_data<T>(context.GetPlace());
     T* racall_data = recall->mutable_data<T>(context.GetPlace());
     T* f1_data = f1->mutable_data<T>(context.GetPlace());
+    int64_t* num_infer_chunks_data =
+        num_infer_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_label_chunks_data =
+        num_label_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_correct_chunks_data =
+        num_correct_chunks->mutable_data<int64_t>(context.GetPlace());
+    *num_infer_chunks_data = 0;
+    *num_label_chunks_data = 0;
+    *num_correct_chunks_data = 0;
 
     auto lod = label->lod();
     PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
@@ -166,17 +176,23 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     for (int i = 0; i < num_sequences; ++i) {
       int seq_length = lod[0][i + 1] - lod[0][i];
       EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
-                 output_segments, label_segments, num_output_segments,
-                 num_label_segments, num_correct, num_chunk_types,
-                 num_tag_types, other_chunk_type, tag_begin, tag_inside,
-                 tag_end, tag_single, excluded_chunk_types);
+                 output_segments, label_segments, *num_infer_chunks_data,
+                 *num_label_chunks_data, *num_correct_chunks_data,
+                 num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
+                 tag_inside, tag_end, tag_single, excluded_chunk_types);
     }
-    *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
-                                                     num_output_segments;
-    *racall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
-                                                 num_label_segments;
-    *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) /
-                                      ((*precision_data) + (*racall_data));
+    *precision_data = !(*num_infer_chunks_data)
+                          ? 0
+                          : static_cast<T>(*num_correct_chunks_data) /
+                                (*num_infer_chunks_data);
+    *racall_data = !(*num_label_chunks_data)
+                       ? 0
+                       : static_cast<T>(*num_correct_chunks_data) /
+                             (*num_label_chunks_data);
+    *f1_data = !(*num_correct_chunks_data)
+                   ? 0
+                   : 2 * (*precision_data) * (*racall_data) /
+                         ((*precision_data) + (*racall_data));
   }
 
   void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
index 0b7975a63f7d364bf9b0ce529e2dd72d9f3cd2e9..b90921d79baa920f0b6f92cde2f7e1ca9183d0d2 100644
--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/clip_by_norm_op.h"
 
@@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel {
 
 class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipByNormOpMaker(framework::OpProto* proto,
-                    framework::OpAttrChecker* op_checker)
+  ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor) The input of clip_by_norm op."
diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu
index acd75438230715420470b81f7a5e5953bd8b8abe..cbf8fa44133739f948fed13e18fc5cbaabd3abb0 100644
--- a/paddle/operators/clip_by_norm_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/clip_by_norm_op.h"
 
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
index d8db1566b0e8c9c351d3b6d6aca1d22d991fe76e..87956a707cf58afa2336602b8ab6acf73b0ff814 100644
--- a/paddle/operators/clip_by_norm_op.h
+++ b/paddle/operators/clip_by_norm_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index 6092212de4635e2ada81f8383a0ccf64a8116158..573bb9c7dfdac2366c2458dd9f27a035a9f9b813 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/clip_op.h"
 
@@ -38,7 +38,7 @@ class ClipOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor)The input of clip op."
diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu
index bb7dcc671a46758a6bd09e8035cf8d3f5e464b3b..5ccbc9643407c65e8734711744ceac9814f4c6a2 100644
--- a/paddle/operators/clip_op.cu
+++ b/paddle/operators/clip_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/clip_op.h"
 
diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h
index 0c40797410950641d3d509a4980d5c4bdbd75cff..51db185dffd80cc3b839d063acaf3f936d732817 100644
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
index bf7e88368157d29e627c3c06384f28b6e5e4ecc1..44665b7872acab8178552e5504916408cf566d13 100644
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/compare_op.h"
 #include "paddle/framework/op_registry.h"
@@ -20,8 +20,7 @@ namespace operators {
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CompareOpProtoMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     OpComment comment;
     AddInput("X",
@@ -67,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
     kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
     return kt;
diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu
index 596a878bcf9f5b81c87c3bd419a2f46c0a450635..26049271befd1fe57001659d1a406e73de0004a7 100644
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/compare_op.h"
 
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
index a56536e155531ac9ea3d17256210bdb9f4212181..567e89c0a727ad0cdd2add8ec8b2a42c86a58007 100644
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <math.h>
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index cf522d6921ee746d03d8082b8fc4d051f4d504e6..32b61edfd0dd163e5ef8f3d1de133c55314458b5 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -58,7 +58,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input tensors of concat operator.").AsDuplicable();
     AddOutput("Out", "Output tensor of concat operator.");
@@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
-            ops::ConcatOpGrad)
+REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
+               ops::ConcatOpGrad, false)
 REGISTER_OP_CPU_KERNEL(concat,
                        ops::ConcatKernel<paddle::platform::CPUPlace, float>)
 REGISTER_OP_CPU_KERNEL(concat_grad,
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index b809bdc3a0fea727f2fb6ea0a55672ee9b0bbd04..e333002bfd1ab40c62882f09cd207a12a0939648 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/cond_op.h"
-
 #include "paddle/operators/gather.h"
 #include "paddle/operators/scatter.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -193,20 +193,22 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
   }
 }
 
-void CondOp::Run(const Scope& scope,
-                 const platform::DeviceContext& dev_ctx) const {
+void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(place);
+
   PrepareDataForSubnet(scope, dev_ctx);
   std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
   for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+    sub_net_op_[i]->Run(*sub_scopes[i], place);
   }
   MergeDataFromSubnet(scope, dev_ctx);
 }
 
 class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CondOpProtoAndCheckerMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Cond", "The condition, which is a bool vector");
     AddInput("Xs", "Inputs of Subnets").AsDuplicable();
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 93121fb31be287794249b5a62386d5a8dd268a0c..7dcdc47e0b2ff216bea92d083fe5897009384d39 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase {
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
+           const platform::Place& place) const override;
 
  private:
   const int TRUE_BRANCH = 0;
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
index 6f2ef9174e84a0c0ae096956c04039435e6583c6..3cae61a438431e72cb24d714c761676cc0c3a41f 100644
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <algorithm>
 #include "paddle/framework/executor.h"
 #include "paddle/framework/op_registry.h"
@@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp {
                      const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
     bool need_run = std::all_of(
         xs.begin(), xs.end(),
@@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp {
       scopes->front() = &scope.NewScope();
       auto &cur_scope = *scopes->front();
 
-      auto *block = Attr<framework::BlockDescBind *>("sub_block");
-      framework::Executor exec(dev_ctx);
+      framework::Executor exec(dev_place);
+      auto *block = Attr<framework::BlockDesc *>("sub_block");
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
     }
   }
@@ -74,8 +74,7 @@ class ConditionalBlockOp : public ConditionalOp {
 
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The conditional variable of this operator. If X is empty, the "
@@ -87,7 +86,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
               "(std::vector<Scope*>) The step scope of conditional block. To "
               "unify the conditional block, rnn and while op, the type of "
               "scope is std::vector<Scope*>");
-    AddAttr<framework::BlockDescBind *>(
+    AddAttr<framework::BlockDesc *>(
         "sub_block", "The step block of conditional block operator");
     AddComment(R"DOC(Conditional block operator
 
@@ -105,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
                          const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
     bool need_run = std::all_of(
         xs.begin(), xs.end(),
@@ -117,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
       framework::Scope &cur_scope = *scopes[0];
 
-      auto *block = Attr<framework::BlockDescBind *>("sub_block");
-      framework::Executor exec(dev_ctx);
+      framework::Executor exec(dev_place);
+      auto *block = Attr<framework::BlockDesc *>("sub_block");
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
 
-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
                                   Outputs(framework::GradVarName("Params")));
 
-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
                                   Outputs(framework::GradVarName("X")));
     }
   }
 
  private:
   void AssignLocalGradientToGlobal(
-      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const platform::Place &place, const framework::Scope &cur_scope,
       const std::vector<std::string> &p_names,
       const std::vector<std::string> &pg_names) const {
     for (size_t i = 0; i < p_names.size(); ++i) {
@@ -145,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto assign = framework::OpRegistry::CreateOp(
           "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
           framework::AttributeMap{});
-      assign->Run(cur_scope, dev_ctx);
+      assign->Run(cur_scope, place);
       cur_scope.Rename(new_in_grad_name, in_grad_name);
     }
   }
@@ -171,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad_op = new framework::OpDesc();
     grad_op->SetType("conditional_block_grad");
     grad_op->SetInput("X", Input("X"));
     grad_op->SetInput("Params", Input("Params"));
     grad_op->SetInput("Out", Output("Out"));
     grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     grad_op->SetInput("Scope", Output("Scope"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
+    grad_op->SetOutput(framework::GradVarName("Params"),
+                       InputGrad("Params", false));
     grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
index 008bf01885ecddd1fee76a33c43370d07a8988a2..84d9ce1973a4cccadcb8f78feaecbcaa9e7af312 100644
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_op.h"
 
@@ -19,8 +19,7 @@ namespace operators {
 
 class CudnnConv2DOpMaker : public Conv2DOpMaker {
  public:
-  CudnnConv2DOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : Conv2DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
@@ -34,8 +33,7 @@ class CudnnConv2DOpMaker : public Conv2DOpMaker {
 
 class CudnnConv3DOpMaker : public Conv3DOpMaker {
  public:
-  CudnnConv3DOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CudnnConv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : Conv3DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 3da0a9001aafbb5b2c4b9a91c4527d9437ac38a1..0aa7dd48cafc3e2387ac902882d84ce9029cfcd0 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
@@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
@@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
     // Allocate on GPU memory
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv forward ---------------------
     T alpha = 1.0f, beta = 0.0f;
@@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");
     auto filter = ctx.Input<Tensor>("Filter");
     auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     // Already on GPU
     void* cudnn_workspace = nullptr;
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     T alpha = 1.0f, beta = 0.0f;
@@ -315,6 +315,10 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
+REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace,
+                   paddle::operators::CudnnConvOpKernel<float>,
+                   paddle::operators::CudnnConvOpKernel<double>);
+
 REGISTER_OP_CUDA_KERNEL(conv2d_cudnn,
                         paddle::operators::CudnnConvOpKernel<float>,
                         paddle::operators::CudnnConvOpKernel<double>);
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 7ef805fd44bf94d3279ffa50f86993b3f2b64412..ab52a41b539236f1691ce8bc02d31e336ee4ccbb 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_op.h"
 
@@ -66,8 +66,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }
 
-Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "Input",
@@ -138,8 +137,7 @@ $$
 )DOC");
 }
 
-Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "Input",
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 38615a8befab91633423b7cd8536253a0d049ac3..4f942444f3eb5584f07399b8d1b4d6a5087496d4 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_op.h"
 
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index d2de4e80f751d4938ac9cad60871b470fccf225c..83786e2329e7ae3c2908fdfdaeb1f79d19a53f47 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -50,14 +50,12 @@ inline bool IsExpand(std::vector<int64_t>& filter_dim,
 // operator implementations can reuse the code.
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };
 
 class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv3DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };
 
 class ConvOp : public framework::OperatorWithKernel {
diff --git a/paddle/operators/conv_shift_op.cc b/paddle/operators/conv_shift_op.cc
index a4150a5664690e750d2501a1849767c23209186b..106b68a0a0e787a0c9da2de924f4646c77b42b41 100644
--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_shift_op.h"
 #include "paddle/framework/eigen.h"
@@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel {
 
 class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConvShiftOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu
index f7ca82ce2635f9ef9d7e9a062d148448e61c163c..cf7abc196e1293ab1b998d1a8cb9c361a7c2d427 100644
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_shift_op.h"
 #include "paddle/operators/math/math_function.h"
diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h
index 1a70b38a0d8cb82ad1f818148306b7ec5f334744..6781d87ef0d99a0b0fc4747245920b6a38a33804 100644
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc
index 4cb6a2ccffc76066ea0868f76ba2a3bfb9e5e450..2e5333a265f2f59f31c651b8bb080599ec6e31a4 100644
--- a/paddle/operators/conv_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_transpose_op.h"
 
@@ -19,11 +19,8 @@ namespace operators {
 
 class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
  public:
-  CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
+  CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : Conv2DTransposeOpMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault({1, 1});
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
                  "workspace is a section of GPU memory which will be "
@@ -36,11 +33,8 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
 
 class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
  public:
-  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
+  CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : Conv3DTransposeOpMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault({1, 1, 1});
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
                  "workspace is a section of GPU memory which will be "
diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc
index f0297f6c40c132c28b50184997d657451f26362b..fc37776ba1ed35aa6b2523eb593e9713cfcc54eb 100644
--- a/paddle/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
@@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
@@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
             cudnn_output_desc, algo, &workspace_size_in_bytes));
 
     // Allocate on GPU memory
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 
     // ------------------- cudnn conv transpose forward ---------------------
@@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");
     auto filter = ctx.Input<Tensor>("Filter");
     auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     // Already on GPU
     void* cudnn_workspace = nullptr;
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index ca063e94bbe64817567a298c3b1ad9306667536d..74636d138f1e40474a1cc5453609dafe14fcaaab 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_transpose_op.h"
 
@@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   auto filter_dims = ctx->GetInputDim("Filter");
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
@@ -41,20 +42,24 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
                     "ConvTransposeOp paddings dimension and strides "
                     "dimension should be the same.");
+  PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(),
+                    "ConvTransposeOp paddings dimension and dilations "
+                    "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
                     "In ConvTransposeOp, The input channel should be the same "
                     "as the number of filters.");
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
   for (size_t i = 0; i < strides.size(); ++i) {
+    auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
     output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
-                           filter_dims[i + 2]);
+                           filter_extent);
   }
   ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }
 
-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "Input",
@@ -73,6 +78,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
+
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1}), the "
+                            "dilations(h_dilation, w_dilation) of convolution "
+                            "transpose operator.")
+      .SetDefault({1, 1});
   AddAttr<std::vector<int>>(
       "strides",
       "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
@@ -87,7 +98,7 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
 Convolution2D Transpose Operator.
 
 The convolution transpose operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and dilations, strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
 Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
 number of channels, H is the height of the feature, and W is the width of the feature.
@@ -112,8 +123,8 @@ Example:
 )DOC");
 }
 
-Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput("Input",
            "(Tensor) The input tensor of convolution transpose operator."
@@ -136,6 +147,13 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
             "Where N is batch size, C is "
             "the number of channels, D is the depth of the feature, H is the "
             "height of the feature, and W is the width of the feature.");
+
+  AddAttr<std::vector<int>>(
+      "dilations",
+      "(vector<int> default:{1, 1, 1}), the "
+      "dilations(d_dilation,h_dilation, w_dilation) of convolution "
+      "transpose operator.")
+      .SetDefault({1, 1, 1});
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1, 1}), the "
                             "strides{d_stride, h_stride, w_stride} of "
@@ -149,7 +167,7 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
 Convolution3D Transpose Operator.
 
 The convolution transpose operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and dilations, strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
 Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
 number of channels, D is the depth of the feature, H is the height of the feature,
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc
index b91ebd7922f2e101df8d6ef5892a62ec5a10cf99..f1d827c606283440debb9a0edb25168816a3a08c 100644
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/conv_transpose_op.h"
 
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 1171b0435fd2b1abe541043e8283a8fc09dc13c7..4c8f8a80672788e8b2919e500d3627adec1ad035 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -30,14 +30,12 @@ using DDim = framework::DDim;
 // operator implementations can reuse the code.
 class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
+  Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };
 
 class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv3DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
+  Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };
 
 class ConvTransposeOp : public framework::OperatorWithKernel {
@@ -63,6 +61,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
     // groups will alway be disabled in conv2dtranspose.
 
     const int batch_size = static_cast<int>(input->dims()[0]);
@@ -115,7 +114,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
     math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
     math::Col2VolFunctor<DeviceContext, T> col2vol;
-    std::vector<int> dilations({1, 1, 1});
 
     // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
     // on input)
@@ -167,6 +165,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -221,7 +220,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
 
       math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
       math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      std::vector<int> dilations({1, 1, 1});
 
       if (input_grad) {
         input_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index 440c427cba9396ec6d0ebf7814d671e45f45412d..9019a1edb379be4007e38d3c0dc71feae23ae4e8 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/cos_sim_op.h"
 
@@ -62,7 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel {
 
 class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CosSimOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The 1st input of cos_sim op.");
     AddInput("Y", "The 2nd input of cos_sim op.");
diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu
index 1cb01f5945f691747bac609ca4a93e2d15cde5bf..9e5d1b6e4f0b6e482edd96df93d535e05dba3bc6 100644
--- a/paddle/operators/cos_sim_op.cu
+++ b/paddle/operators/cos_sim_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/cos_sim_op.h"
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index fecb5a79b2397dd73d991a1a87efcf84d60ef882..e2b6282c0913e8ad16f8e3f6c3054f9567822d15 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
index 1ce189fa6ebba3712467572c55d599975bbe7534..024e1d061a5b2eabc27110d5379cb8226a104079 100644
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -18,8 +18,7 @@ namespace paddle {
 namespace operators {
 class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CRFDecodingOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Emission",
              "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "
@@ -121,12 +120,18 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
         ctx.device_context());
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::OpKernelType& actual_kernel_type) const override {
+    return framework::OpKernelType(actual_kernel_type.data_type_,
+                                   platform::CPUPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
index 7c2a0ac7a705e5aac3d181545f8dfc8881e811f2..310e351443112c340054cf092cd2443b309ec49c 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/crop_op.h"
 #include <boost/lexical_cast.hpp>
@@ -52,7 +52,7 @@ class CropOp : public framework::OperatorWithKernel {
 
 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CropOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input of pad op. "
@@ -88,7 +88,8 @@ There are two ways to set shape:
 
 The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
-Given:
+Case 1:
+Given
 
     X = [[0, 1, 2, 0, 0]
          [0, 3, 4, 0, 0]
@@ -107,6 +108,27 @@ we get:
     Out = [[1, 2],
            [3, 4]].
 
+
+Case 2:
+Given
+
+    X = [[0, 1, 2, 5, 0]
+         [0, 3, 4, 6, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    Y = [[0, 0, 0]
+         [0, 0, 0]],
+
+we get:
+
+    Out = [[1, 2, 5],
+           [3, 4, 6]].
 )DOC");
   }
 };
diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu
index 90fd83ca10b750896a9fe144d3c30fabb2f54e0a..bba5db4c6ce682cb00482e35fa1e340aba83e37f 100644
--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/crop_op.h"
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index d531a19c783d2768d24142bb7b974ccfc2b39350..69d1a92977250b4e8a64b47ac66444724fbc53f6 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 CropdleCropdle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 2b06012b690c6725fd150cd99e992912655dc9c6..fe39cb481aa63fa401603d97778b337282511ab5 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -111,19 +111,18 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
 
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CrossEntropyOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
-             "where N is the batch size and D is the number of classes. "
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             " where N is the batch size and D is the number of classes. "
              "This input is a probability computed by the previous operator, "
              "which is almost always the result of a softmax operator.");
     AddInput("Label",
              "(Tensor), the ground truth which is a 2-D tensor. When "
              "soft_label is set to false, Label is a Tensor<int64> with shape "
              "[N x 1]. When soft_label is set to true, Label is a "
-             "Tensor<float/double> with shape [N x K].");
+             "Tensor<float/double> with shape [N x D].");
     AddOutput("Y",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape "
               "[N x 1]. The cross entropy loss.");
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 05469645880fa466a2a3324ad1b7a8b9d681c440..3b04894e6ccb08c13e2d24bb38196fdc7935bf9e 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/cross_entropy_op.h"
 
diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc
index fd29c7270b0442da740a74f83fdfeed8f47f830d..739a8d881c35817756421a3299901c9e5e7d96ba 100644
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -55,8 +55,7 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
 
 class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DecayedAdagradOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu
index 282b90f275ad1542d5941e001dbf646348fc01b6..7bc8161f2339572c2a9284f865846b9b7e594354 100644
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/decayed_adagrad_op.h"
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
index 89dc5045221156eed7aa9411bc96ad86f91136d2..b746f9df4640bac7b5c4e83091f2c2ab67e778be 100644
--- a/paddle/operators/detail/recv_impl.cc
+++ b/paddle/operators/detail/recv_impl.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "send_recv_impl.h"
 
@@ -20,25 +20,57 @@ namespace detail {
 
 Status SendRecvServerImpl::SendVariable(ServerContext *context,
                                         const VariableMessage *in_var,
-                                        VariableMessage *out_var) {
-  framework::LoDTensor t;
-  // TODO(typhoonzero): desirealize in_tensor and run pserver network.
+                                        VoidMessage *out_var) {
+  // TODO(typhoonzero): support different variable types.
   std::istringstream iss(in_var->serialized());
+  framework::LoDTensor t;
   framework::DeserializeFromStream(iss, &t);
-  lodtensor_queue_.Push(std::move(t));
-  // Block util the sub graph is done.
-  t = lodtensor_return_queue_.Pop();
+  TensorWithName tensor_with_name =
+      std::make_pair(in_var->varname(), std::move(t));
+
+  var_recv_queue_.Push(std::move(tensor_with_name));
+  return Status::OK;
+}
+
+Status SendRecvServerImpl::GetVariable(ServerContext *context,
+                                       const VariableMessage *in_var,
+                                       VariableMessage *out_var) {
+  std::string get_var_name = in_var->varname();
+  auto *var = scope_->FindVar(get_var_name);
+  auto tensor = var->Get<framework::LoDTensor>();
   std::ostringstream oss;
-  // FIXME(typhoonzero): get context from op.
-  framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+  framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext());
+
   std::string *varname = out_var->mutable_varname();
-  *varname = in_var->varname();
+  *varname = get_var_name;
   std::string *serialized = out_var->mutable_serialized();
   *serialized = oss.str();
+  return Status::OK;
+}
 
+Status SendRecvServerImpl::Wait(ServerContext *context,
+                                const VoidMessage *in_var,
+                                VoidMessage *out_var) {
+  {
+    std::unique_lock<std::mutex> lock(this->mutex_);
+    condition_.wait(lock, [=] { return this->done_ == true; });
+  }
   return Status::OK;
 }
 
+void SendRecvServerImpl::Reset() {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+  done_ = false;
+}
+
+void SendRecvServerImpl::Done() {
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_);
+    done_ = true;
+  }
+  condition_.notify_all();
+}
+
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/detail/safe_ref.h b/paddle/operators/detail/safe_ref.h
index b71af17309f9f46b5c87f0f479d4e03443fa7f93..ff2a156f3d0e3c5c55354375ca81adf433a49686 100644
--- a/paddle/operators/detail/safe_ref.h
+++ b/paddle/operators/detail/safe_ref.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
index da1ddf75d2afb85670c5ea0c9884376415f28208..a812fcf39bc19a6e06dc594c13076696e7949372 100644
--- a/paddle/operators/detail/send_impl.cc
+++ b/paddle/operators/detail/send_impl.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "send_recv_impl.h"
 
@@ -19,10 +19,10 @@ namespace operators {
 namespace detail {
 
 bool RPCClient::SendVariable(const framework::Scope& scope,
-                             const std::string& inname,
-                             const std::string& outname) {
+                             const std::string& inname) {
   ClientContext context;
-  VariableMessage msg, out_msg;
+  VariableMessage msg;
+  VoidMessage out_msg;
   // FIXME(typhoonzero): pass device context to here.
   auto ctx = platform::CPUDeviceContext();
   auto* var = scope.FindVar(inname);
@@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
   msg.set_serialized(oss.str());
   Status status = stub_->SendVariable(&context, msg, &out_msg);
   if (!status.ok()) {
+    LOG(ERROR) << "gRPC error: " << status.error_message();
     return false;
   }
-  std::istringstream iss(out_msg.serialized());
+  return true;
+}
+
+bool RPCClient::GetVariable(const framework::Scope& scope,
+                            const std::string& outname) {
+  ClientContext context;
+  VariableMessage call_msg, ret_msg;
+  call_msg.set_varname(outname);
+  auto ctx = platform::CPUDeviceContext();
+  Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
+  if (!status.ok()) {
+    LOG(ERROR) << "gRPC error: " << status.error_message();
+    return false;
+  }
+
+  std::istringstream iss(ret_msg.serialized());
+
   framework::LoDTensor ret_tensor;
   framework::DeserializeFromStream(iss, &ret_tensor);
   auto* outvar = scope.FindVar(outname);
@@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
   return true;
 }
 
+void RPCClient::Wait() {
+  ClientContext context;
+  VoidMessage call_msg, ret_msg;
+  stub_->Wait(&context, call_msg, &ret_msg);
+}
+
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
index 07ff9d2c621a2dfb51792821a0d3fc398c315835..95c8e708986eac2a27ece70864efd7eac5f84ce8 100644
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 syntax = "proto3";
 
@@ -19,7 +19,12 @@ package sendrecv;
 service SendRecvService {
   // For parameter server round-robin like hashing, do not split tensors.
   // Send and recv only one tensor
-  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+  // TODO(typhoonzero): add streaming API
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // wait for one execution of the program
+  rpc Wait(VoidMessage) returns (VoidMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
index b9a5340a8636db7b5d6ec7b21368632d3916b4aa..47f730f7ae897096fbdd23a55252448cf6655fb4 100644
--- a/paddle/operators/detail/send_recv_impl.h
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -20,10 +20,6 @@
 #include "paddle/framework/selected_rows.h"
 #include "paddle/operators/detail/simple_block_queue.h"
 
-// #include <grpc++/channel.h>
-// #include <grpc++/client_context.h>
-// #include <grpc++/create_channel.h>
-// #include <grpc++/security/credentials.h>
 #include "paddle/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/operators/detail/send_recv.pb.h"
 
@@ -48,24 +44,34 @@ namespace paddle {
 namespace operators {
 namespace detail {
 
+typedef std::pair<std::string, framework::LoDTensor> TensorWithName;
+
 class SendRecvServerImpl final : public SendRecvService::Service {
  public:
   explicit SendRecvServerImpl() {}
 
   Status SendVariable(ServerContext *context, const VariableMessage *in_var,
-                      VariableMessage *out_var) override;
+                      VoidMessage *out_var) override;
+  Status GetVariable(ServerContext *context, const VariableMessage *in_var,
+                     VariableMessage *out_var) override;
+  Status Wait(ServerContext *context, const VoidMessage *in_var,
+              VoidMessage *out_var) override;
+  void Reset();
+  void Done();
+  void SetScope(framework::Scope *scope) { scope_ = scope; };
 
-  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+  const TensorWithName Get() { return this->var_recv_queue_.Pop(); }
 
-  void Push(const framework::LoDTensor &tensor) {
-    this->lodtensor_return_queue_.Push(tensor);
-  }
+  void Push(const TensorWithName &msg) { this->var_recv_queue_.Push(msg); }
 
  private:
-  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
-  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
-  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
-  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+  // received variable from RPC, operators fetch variable from this queue.
+  SimpleBlockQueue<TensorWithName> var_recv_queue_;
+  framework::Scope *scope_;
+  // condition of the sub program
+  std::mutex mutex_;
+  bool done_;
+  std::condition_variable condition_;
 };
 
 // RPCClient is a class to send tensors to pserver sub-network
@@ -75,8 +81,9 @@ class RPCClient {
   RPCClient(std::shared_ptr<Channel> channel)
       : stub_(SendRecvService::NewStub(channel)) {}
 
-  bool SendVariable(const framework::Scope &scope, const std::string &inname,
-                    const std::string &outname);
+  bool SendVariable(const framework::Scope &scope, const std::string &inname);
+  bool GetVariable(const framework::Scope &scope, const std::string &outname);
+  void Wait();
 
  private:
   std::unique_ptr<SendRecvService::Stub> stub_;
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
index 44899217579532af2c1d2e6074ec0e08231e7b86..c7f5ff4b5f494ce80b4ee792afb44841e9c8a2f2 100644
--- a/paddle/operators/detail/simple_block_queue.h
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h
index 068c82f399316a1587d7322d8dab75823656800e..9ed524d4dcf7f8bd4607281ade34e9d56f409085 100644
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/ddim.h"
@@ -35,7 +35,7 @@ struct StridedMemcpyFunctor<T, 1> {
       memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
     } else {
 #ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::GPUPlace>(place);
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
       memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index acd526ae8047292ce6c6756f174c80053dca0d9f..35cb18797ff66cb87a6658e73ce02b0bfae29baa 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/dropout_op.h"
 
@@ -25,8 +25,6 @@ class DropoutOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
 
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim("Out", x_dims);
@@ -40,15 +38,18 @@ class DropoutOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DropoutOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of dropout op.");
     AddOutput("Out", "The output of dropout op.");
     AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
 
     AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
-        .SetDefault(.5f);
+        .SetDefault(.5f)
+        .AddCustomChecker([](const float& drop_p) {
+          PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
+                         "'dropout_prob' must be between 0.0 and 1.0.");
+        });
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
 
@@ -79,8 +80,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) must not be null.");
 
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
     auto x_dims = ctx->GetInputDim("X");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
     PADDLE_ENFORCE_EQ(x_dims, out_dims,
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index 10c670751d026ef92e01aad7da31a8f59b8514c0..c56930336e865079f1b96df0f35b0a051fe63a27 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include <thrust/device_ptr.h>
@@ -30,16 +30,15 @@ struct MaskGenerator {
   __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
       : dropout_prob(dropout_prob), seed(seed) {}
 
-  __host__ __device__ T operator()(const unsigned int n) const {
+  inline __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed);
     thrust::uniform_real_distribution<AttrType> dist(0, 1);
     rng.discard(n);
     if (dist(rng) < dropout_prob) {
       return static_cast<T>(0);
-    } else {
-      return static_cast<T>(1);
     }
+    return static_cast<T>(1);
   }
 };
 
@@ -71,7 +70,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       auto M = EigenMatrix<T>::Reshape(*mask, 1);
       Y.device(place) = X * M;
     } else {
-      Y.device(place) = X * dropout_prob;
+      Y.device(place) = X * (1.0f - dropout_prob);
     }
   }
 };
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index 84ad39f0bb639975365d427aa205411ef79ecd46..c90b8d277eb78048c001d36a367287146b51c636 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <random>
@@ -57,7 +57,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
-      Y.device(place) = X * dropout_prob;
+      Y.device(place) = X * (1.0f - dropout_prob);
     }
   }
 };
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
index a62eeeeb95fef77c00258403ca1cae11c2db7173..70b7c9f2ec11bf8ad56a24324a53792955edc77d 100644
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/elementwise_add_op.h"
 #include "paddle/operators/elementwise_op.h"
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseAddOpMaker : public ElementwiseOpMaker {
  public:
-  ElementwiseAddOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
     SetComment("Add", "$Out = X + Y$");
     AddComment(comment_);
diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu
index 78642bb4246e7328dd3e2d902aca88615d598ddf..641cea323acee549898cb6f0245ccac4c069ce32 100644
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/elementwise_add_op.h"
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index 069bdaf0ab7469b0a814ca5f68b444b9ce4904f1..59abbb57d1dcfbef6ead70e4afa9a3816d60d9b5 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
index 1c3e9e70eef0c1adfb89cf1a58437092f8d536d7..1fa960866fa2066a351ef2e65a3c77cf8b6595f7 100644
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/elementwise_div_op.h"
 #include "paddle/operators/elementwise_op.h"
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseDivOpMaker : public ElementwiseOpMaker {
  public:
-  ElementwiseDivOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
     SetComment("Div", "$Out = X / Y$");
     AddComment(comment_);
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu
index 502c52893667e246a19bb04c8bf3ed3df3265f2d..a0372123d6ffe7e3a90727ddd37a787c1030e0bd 100644
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/elementwise_div_op.h"
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index d91313db4225d8fe051856345367a15867bdf215..875abd313ffc8fdf910d461922ff41f65ef276e7 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index aadb95cbe35fe565cf1009f0f9765def921d0906..a6d11736194cb79bdc247c721acf8bda9c81dbe5 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/elementwise_mul_op.h"
 #include "paddle/operators/elementwise_op.h"
@@ -20,8 +20,7 @@ namespace operators {
 
 class ElementwiseMulOpMaker : public ElementwiseOpMaker {
  public:
-  ElementwiseMulOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
     SetComment("Mul", "$Out = X \\odot\\ Y$");
     AddComment(comment_);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index 089451b3e1288b3adc689a3c7d9fea2bc5243407..f73e8afda960a89aff8568eab66b0f120db2e342 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/elementwise_mul_op.h"
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 16fa5ec4b3a369805acb401bae5407072101af8d..3ee50207c07fa2b7ccf2c002903a4f055dbfb352 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/operators/elementwise_op_function.h"
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index ea533503e4916cae7e1157ed34da9629dcff3513..f308ee05e11210540e41cda4b9a896f9f96c4730 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -43,8 +43,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
 
 class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ElementwiseOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The first input tensor of elementwise op");
     AddInput("Y", "(Tensor) The second input tensor of elementwise op");
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index 7ebfc7df8c117edd7bcf14cc5ae6ba3dc1302c03..560247cb108dce5432bfe66556b9e675a3accc27 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -103,10 +103,12 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
 
   MidWiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
     ++j_;
-    i_ = j_ / post_;
-    if (UNLIKELY(i_ == n_)) {
+    if (UNLIKELY(j_ == post_)) {
+      ++i_;
       j_ = 0;
-      i_ = 0;
+      if (UNLIKELY(i_ == n_)) {
+        i_ = 0;
+      }
     }
     return *this;
   }
@@ -125,10 +127,10 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
 
  private:
   const T* ptr_;
-  int i_;
+  int64_t i_;
   int64_t j_;
   int64_t n_;
-  int post_;
+  int64_t post_;
 };
 
 #ifdef __NVCC__
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
index 3e4d19361ead0100e45e50880d402e3d2b8557ff..2a8d0845b1800277a7d3cd6ff6c5c984e92197ee 100644
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/elementwise_sub_op.h"
 #include "paddle/operators/elementwise_op.h"
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseSubOpMaker : public ElementwiseOpMaker {
  public:
-  ElementwiseSubOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
     SetComment("Sub", "$Out = X - Y$");
     AddComment(comment_);
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu
index 0b2f0f7d4d98f1336087f9fc3fc485ed8d805b5f..7a2516ef6a6d5739e9f122455d289cbfeaaf2549 100644
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/elementwise_sub_op.h"
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index 731a30c5e30d3f9bbdbabd62e5d9a77559500b06..66edf8672d13086f883f0a2ad7ef5802317cc79a 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/operators/elementwise_op_function.h"
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
index 8b3cddbb944de250d5754a2be64dd8e7ec53003a..08fa91ed72aa41ed2f513c090b9085410bb5cc47 100644
--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
@@ -55,7 +55,7 @@ class ExpandOp : public framework::OperatorWithKernel {
 
 class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu
index 99ee584d0859f9bf688899cc9b346d221415518c..84e8fa567b80599d9687fed516eac6fbb308b24a 100644
--- a/paddle/operators/expand_op.cu
+++ b/paddle/operators/expand_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
index 14ef8b0912860f7ec39535997c39d6d4c4970650..1d9012cd4a4c6ad596e7d434b5c4ecea1ddcde87 100644
--- a/paddle/operators/expand_op.h
+++ b/paddle/operators/expand_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index ee43c22fb13e203c7de1a7e6d1586423fcbfb25a..48da52c3b68879a1da8550a5448090f9f1e715d3 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/op_registry.h"
@@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
 
@@ -47,15 +47,19 @@ class FeedOp : public framework::OperatorBase {
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    framework::CopyFrom(feed_item, place, dev_ctx, out_item);
     out_item->set_lod(feed_item.lod());
   }
 };
 
 class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FeedOpInfoMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of feed op");
     AddOutput("Out", "The output of feed op");
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 1ae07194c235ce6724f59c9c60df80f957787cda..387d1e0a747f71d85826b52d140c2838112227f6 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -1,19 +1,20 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
@@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
     dev_ctx.Wait();
     dst_item.set_lod(src_item.lod());
@@ -61,8 +65,7 @@ class FetchOp : public framework::OperatorBase {
 
 class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FetchOpInfoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of fetch op");
     AddOutput("Out", "The output of fetch op");
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 7fb74e2b950338fbd05515f844959862504eddce..852ecdfe45e7f4737a505c1f722d25457ad6ad32 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -49,10 +49,10 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
         ctx.device_context());
   }
 };
@@ -60,13 +60,12 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
 class FillConstantBatchSizeLikeOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto,
-                                   framework::OpAttrChecker *op_checker)
+  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
     AddInput("Input",
              "(Tensor) Tensor "
              "whose dim_idx th dimension is used to specify the batch_size");
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
index 2e0e15f36bb2e0ffd33dc6d1d25965d0cbe33186..608f4b91623e4ddf0240c37be7a8e56117dd40f2 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/fill_constant_batch_size_like_op.h"
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index 3d5f84bc239615797a5cf01a74150fdb7dfc1b80..dcd43a30c86b62d79f52ac640f14b295a062146c 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -33,8 +34,9 @@ class FillConstantOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
-    auto data_type = static_cast<framework::DataType>(Attr<int>("dtype"));
+           const platform::Place &dev_place) const override {
+    auto data_type =
+        static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
     auto force_cpu = Attr<bool>("force_cpu");
     auto &out =
@@ -44,21 +46,23 @@ class FillConstantOp : public framework::OperatorBase {
       auto cpu = platform::CPUPlace();
       out.mutable_data(cpu, framework::ToTypeIndex(data_type));
     } else {
-      out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
     }
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
     math::set_constant(dev_ctx, &out, value);
   }
 };
 
 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillConstantOpMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc
index 382e161c5d83ba560411b1f231aa896028b709b8..084ba1db62de0a6bf6829f8e9f4c274fb777e879 100644
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -1,20 +1,21 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -42,21 +43,20 @@ class FillOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &out =
         detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                 "Cannot find variable %s", Output("Out"))
                         .GetMutable<framework::LoDTensor>());
     out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
-    auto dtype = static_cast<framework::DataType>(Attr<int>("dtype"));
+    auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     platform::CPUPlace cpu;
     auto force_cpu = Attr<bool>("force_cpu");
-    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
-                     framework::ToTypeIndex(dtype));
+    out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));
 
     framework::LoDTensor tensor;
 
-    if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) {
+    if (force_cpu || platform::is_cpu_place(place)) {
       tensor.ShareDataWith(out);
     } else {
       // Always make tensor in CPU memory.
@@ -67,16 +67,19 @@ class FillOp : public framework::OperatorBase {
     framework::VisitDataType(
         dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
 
-    if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) {
+    if (!force_cpu && platform::is_gpu_place(place)) {
       // Copy tensor to out
-      framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+      framework::CopyFrom(tensor, place, dev_ctx, &out);
     }
   }
 };
 
 class FillOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  FillOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddComment(R"DOC(Fill operator
 
@@ -88,7 +91,7 @@ Fill an tensor with `value` and `shape`. The type of the tensor is specify by
         "value", "The float values of tensor, which are flatten in row major");
     AddAttr<std::vector<int>>("shape", "The shape of output tensor");
     AddAttr<int>("dtype", "The data type of output tensor, Default is float")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
     AddAttr<bool>("force_cpu",
                   "Whether the output tensor must be at CPU memory or not. "
                   "Default is false.")
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index 720c11f5f12a8dea971fe82db6afe8f6b0d9ee1a..b4ae1de876010effff6bf577a4e33043f6760a4f 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -24,20 +24,19 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of FillZerosLikeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of FillZerosLikeOp should not be null.");
-    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Y");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FillZerosLikeOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillZerosLikeOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of fill-zeros-like op.");
-    AddOutput("Y", "The variable will be filled up with zeros.");
+    AddOutput("Out", "The variable will be filled up with zeros.");
     AddComment(R"DOC(
 FillZerosLike Operator.
 
diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc
index 9f412306bb5f08497990f0e0385f695d838c2400..b7048e8f5857e646e16d5017593f5d3c6e79ea7e 100644
--- a/paddle/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/fill_zeros_like_op.h"
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index a6e2941f52150de7886717303d2cb2f10b7eef7b..351ecf8b2f1d945fabdd1d6c5ed56f76f3caae61 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -23,7 +23,7 @@ template <typename DeviceContext, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
     out->mutable_data<T>(context.GetPlace());
 
     math::SetConstant<DeviceContext, T> setter;
diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc
index b14913ff213c84051b5a945f4a470cea4039a289..d00700823d48eb2ea4fc64d1fa2989f18c7c5f18 100644
--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
@@ -57,7 +57,7 @@ class FTRLOp : public framework::OperatorWithKernel {
 
 class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FTRLOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
index c806aa5f05ad214abb3484935d82b67880a1db7a..9840c066f053e5e1cd1c756d4dd938eace1a5eb4 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/tensor.h"
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index 8f80fb162519f60fcce897b3c31a3507bbf6ba6d..45e9d8df702403e66f9100e4edaf5c17470eb20d 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -67,7 +67,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GatherOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index b37f0576e276b2aa995f01de635ec153a0db36aa..eec2415e1de2434de0a920567863d421d2d3032d 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "gather.cu.h"
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 254c83e1378a121d99c89d9d8705935b5f06edc8..9ed493a7d027e1bd3e8c5fca376678fd5fcf14f1 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -1,13 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <random>
 #include "paddle/framework/op_registry.h"
@@ -57,18 +60,17 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
         ctx.device_context());
   }
 };
 
 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GaussianRandomOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Out", "Output matrix of gaussian random op");
 
@@ -91,7 +93,7 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("dtype",
                  "(int, default 5(FP32)) "
                  "Output data type.")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
 
     AddComment(R"DOC(
 GaussianRandom Operator.
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index ffce6f713816abe7d1f207f141a1b0933574e2ff..8a70db17e17ebf7d5bad1e1ee6a2acdff1b85a09 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
index 311e7edcf1519bc706a51e4d9242a1ebee5168ca..76f2adefede3b4bc4035f86f8f8663eed29343ae 100644
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
@@ -1,13 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/gru_op.h"
 
@@ -67,7 +70,7 @@ class GRUOp : public framework::OperatorWithKernel {
 
 class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Input",
              "(LoDTensor) The first input is a LodTensor, which supports "
diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc
index 458630ca6187ec89638046d8eea63c31eca518f2..9cb0cc42d5589792aae6d99cec807aac6e4991b6 100644
--- a/paddle/operators/gru_op.cu.cc
+++ b/paddle/operators/gru_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/gru_op.h"
 
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index 6d02dff578846904beeb58c5161d27c7c2ed5d70..c6228864d7ec042ff99e4521d1d707ba091e8ed5 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
index 705de87be5b67fbc343a89eeba2282941b264c8a..c354293be7720abd7d96b1c4311b32049a16730c 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/gru_unit_op.h"
 
@@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {
 
 class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GRUUnitOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Input",
              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu
index 7c752db494b59c3ec2af093332777ce6655fb477..95c8c23dadadf0e053012c86d44346ee31565cfc 100644
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/operators/gru_unit_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/gru_unit_op.h"
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
index 8fe60c750da0a42089dc38190d2dda3d08e5ba06..a77be46718b766d9a0a8b8fb4cf2316b44687db8 100644
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc
index 373b4d99b47f2a8ab06c7584a25acee59b6f3e3b..19d2e9dc56fe11f9dfb13e8cb271a23e128bf91b 100644
--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
@@ -46,8 +46,7 @@ class HingeLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HingeLossOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Logits",
              "The input value (Logits) of Hinge loss op."
diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu
index 31a5bde292ebcab899ad05a813c685963dd5bc25..b9cfbc50c49c6cc902cb3667200c12c74fb5d13d 100644
--- a/paddle/operators/hinge_loss_op.cu
+++ b/paddle/operators/hinge_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/hinge_loss_op.h"
diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc
index 11828d083a55f0a38cf3b8513b7395bbb5592581..5c92f2c7b2d2f701bcc487716db41a0cce91002f 100644
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HuberLossOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input value of huber loss op."
diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu
index d49a4d9d4236c402f2559c5a0a5de097c2edc61f..ccc83a16ba271f0aa879c5c075dce1932dd40494 100644
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/operators/huber_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/huber_loss_op.h"
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc
index 54911267e36dfdbc62d533f40f0b754e7d2cb7bf..e0b80cc4e74429dee1b9a25e41b116970ad4de2a 100644
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 
@@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
@@ -70,8 +70,7 @@ class IncrementOp : public framework::OperatorBase {
 
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IncrementOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input tensor of increment operator");
     AddOutput("Out", "(Tensor) The output tensor of increment operator.");
@@ -94,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("increment");
     grad_op->SetInput("X", Output("Out"));
     grad_op->SetOutput("Out", Input("X"));
     grad_op->SetAttr("step", -boost::get<float>(GetAttr("step")));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc
index 54fecf44e881b5c283c81580fd161da9808d253e..492ae48845aa5aa123989e62d07f5ae899af6193 100644
--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
@@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     // get input
     auto *var = scope.FindVar(Input(kInput));
     PADDLE_ENFORCE_NOT_NULL(var);
@@ -47,8 +47,7 @@ class IsEmptyOp : public framework::OperatorBase {
 
 class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IsEmptyOpProtoMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(kInput, "(Tensor) Tensor which is to be checked.");
     AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc
index c0b51202c6bb708a682568175c56583394961535..1a5d6e19263325821dd220d8a31c0e34600b8220 100644
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/l1_norm_op.h"
 
@@ -48,7 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel {
 
 class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  L1NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input of l1_norm op.");
     AddOutput("Out", "(Scalar) The output of l1_norm op.");
diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu
index fd725f86f6c98c5aff844546361d8599ea3527ab..7ecc774670a8480da46ac688f3635e04b1ab7c1f 100644
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/operators/l1_norm_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/l1_norm_op.h"
diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h
index ae3878f2b7b079027a9e9145cefa9eae6b22ffbc..086d42705dceaf0cbd35ce8e5115156a76a0b6e8 100644
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 896e3657d4406c5a1fe07f1712abb2ff0370fd3c..666207ea07628ca5f2a8313fa3f5febda140a294 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 
 class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LinearChainCRFOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Emission",
              "(LoDTensor, default LoDTensor<float>) "
@@ -184,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of linear_chain_crf
   // is determined by its input "Emission".
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
@@ -243,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of output of the linear_chain_crf_grad
   // operator is determined by its input: gradients of LogLikelihood.
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(
diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu
index 3b105ec3414b5d63946331319d0f47a38e7908cc..da612510b4d45d8eefabe7de303e9fd0132c5f77 100644
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/operators/linear_chain_crf_op.cu
@@ -1,10 +1,10 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index 694584e79c3a1e818814a4a2145f52d8db7cf10a..19c6715ec877dea6dcf0babc7373333a4d9eed0f 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     // operators runs on GPU device.
     auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                          Tensor* dst) {
-      dst->mutable_data<T>(platform::GPUPlace());
-      framework::CopyFrom(src, platform::GPUPlace(), ctx, dst);
+      dst->mutable_data<T>(platform::CUDAPlace());
+      framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst);
     };
     copyTensor(ctx, emission_exps_src, emission_exps_dst);
     copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
                          Tensor* dst) {
       if (src && dst) {
-        dst->mutable_data<T>(platform::GPUPlace());
-        framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst);
+        dst->mutable_data<T>(platform::CUDAPlace());
+        framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst);
       }
     };
     copyTensor(ctx, emission_grad_src, emission_grad_dst);
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index 4e58b84430f2a8697bbbc1acf971fd063120f563..65f021d91931541b712bd46aebc06e68144b2af0 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -1,20 +1,20 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fstream>
 
 #include "paddle/framework/op_registry.h"
-
-#include <fstream>
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase {
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
     framework::DeserializeFromStream(fin, tensor);
 
-    auto place = dev_ctx.GetPlace();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     if (platform::is_gpu_place(place)) {
       // copy CPU to GPU
       framework::LoDTensor cpu_tensor;
@@ -58,8 +60,7 @@ class LoadOp : public framework::OperatorBase {
 
 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoadOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Out", "(Tensor) The tensor need to be loaded");
     AddAttr<std::string>("file_path",
diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc
index b2f4ec57fadd2ba3dc8708abbfebaaeb67100f1e..d2c52745cfdf8d0fdb168ef2d90e75a515c31015 100644
--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
@@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
@@ -38,8 +38,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
 
 class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDArrayLengthProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(LoDTensorArray) The input tensor array.");
     AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
index f7d4db1947b83fecf57575e17fafe26795c92bdd..8711dd62c886fdada083d316d6aabc93a050ff82 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/op_registry.h"
 namespace paddle {
@@ -24,19 +24,19 @@ class LoDRankTableOp : public framework::OperatorBase {
                  const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
     VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
     out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
+    VLOG(10) << Input("X") << "'s lod information is " << *out;
   }
 };
 
 class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDRankTableOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(LoDTensor) input lod tensor, must contain lod information.");
@@ -63,11 +63,11 @@ class LoDRankTableInferShape : public framework::InferShapeBase {
 
 class LoDRankTableInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
     for (auto &o : op_desc.Output("Out")) {
       block->FindRecursiveOrCreateVar(o)->SetType(
-          framework::VarDesc::LOD_RANK_TABLE);
+          framework::proto::VarDesc::LOD_RANK_TABLE);
     }
   }
 };
diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc
index 32831cb1e2cf188a507773ef1e00b22de98d82ab..f3c0badf2a74431b980abd532e51ba3d251524a1 100644
--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/operators/lod_reset_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lod_reset_op.h"
 
@@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -48,8 +48,7 @@ class LoDResetOp : public framework::OperatorWithKernel {
 
 class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDResetOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
     AddInput("TargetLoD",
@@ -98,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu
index f7c235898096ffb3d6ba039cb3f01d5bc9ef5364..910866ea6330059f8e0b04e036e3b124e920b5c4 100644
--- a/paddle/operators/lod_reset_op.cu
+++ b/paddle/operators/lod_reset_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lod_reset_op.h"
 
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
index b86f8b13135fa809ade3b001434eda5d88375c2c..306373fb1fb6f16a0db7f0e836e38fd8c49f7e86 100644
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
index b970bf31773f4c6feb0010bd40ba906b388ec310..8d164b4abc54722a95a176dfe8ed341f8c5125d1 100644
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -1,20 +1,21 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
                           Input("X"))
                   .Get<framework::LoDTensor>();
@@ -86,6 +87,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
         auto slice = out[i].Slice(static_cast<int>(offset),
                                   static_cast<int>(offset + len));
+
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
+
         framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
                                     static_cast<int>(each_range.end)),
                             x.place(), dev_ctx, &slice);
@@ -97,8 +103,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
 
 class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDTensorToArrayOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "");
     AddInput("RankTable", "");
@@ -128,10 +133,10 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {
 
 class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
     for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+      block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
     }
   }
 };
@@ -141,14 +146,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("array_to_lod_tensor");
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetInput("RankTable", Input("RankTable"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
index 4524229a330a0ceddca673e2b2a6d836a15a2e3f..f714945354c5668f58e273dc8d6c7c16d51ac17d 100644
--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
@@ -46,8 +46,7 @@ class LogLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LogLossOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Predicted",
              "The input value (Predicted) of Log loss op."
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
index e87ac7d12a2b730085b4e9a33457612c4eba2655..be283e470052cc3a569be564ab4baa6bc5b75808 100644
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/operators/log_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/log_loss_op.h"
diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc
index c818d5e9c19abab15ebdc2b3485e03ab66cf649d..7417192479a13ca9537e2d40f9779a3bf5f1eb61 100644
--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/logical_op.h"
 #include "paddle/framework/op_registry.h"
@@ -20,8 +20,7 @@ namespace operators {
 template <typename OpComment>
 class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BinaryLogicalOpProtoMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     OpComment comment;
     AddInput("X",
@@ -45,8 +44,7 @@ Each element of Out is calculated by %s
 template <typename OpComment>
 class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  UnaryLogicalOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     OpComment comment;
     AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
@@ -101,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
     // LogicalOp kernel's device type is decided by input tensor place
     kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
     return kt;
diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu
index 7fef60e0c9e957f28118e54d23c6043752d2f52f..87f2287b8f11aabe8afe87776eff49295c1ea2ac 100644
--- a/paddle/operators/logical_op.cu
+++ b/paddle/operators/logical_op.cu
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/logical_op.h"
 
diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h
index 629388cac81e60c8b84197238018384ffc59a08f..413857685603c7b84e885135d9aadf7cc71a4f72 100644
--- a/paddle/operators/logical_op.h
+++ b/paddle/operators/logical_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <math.h>
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 93e812ac5be5aea6bf3ab353d31480322c51ccbc..6e5cbd6f8cefc965d6c8d24b16eb3bafde55cc49 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lookup_table_op.h"
 #include "paddle/framework/var_type_inference.h"
@@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
@@ -51,8 +51,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
 
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LookupTableOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("W",
              "An input represents embedding tensors, "
@@ -99,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
@@ -109,19 +108,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 
 class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind& op_desc,
-                  framework::BlockDescBind* block) const override {
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
     auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS);
+      block->Var(out_var_name)
+          ->SetType(framework::proto::VarDesc::SELECTED_ROWS);
     } else {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 9431030a53975acafe9bcb22dc9164492929b07a..261a28da694bf551d8d9e630139680aebc4be51a 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -1,13 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
@@ -101,7 +104,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       // copy GPU memory to CPU pinned memory
       framework::Vector<int64_t> new_rows;
       new_rows.resize(ids_dim[0]);
-      auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
+      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
       memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
                    ids_dim[0] * sizeof(int64_t), stream);
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
index 99b912163b71594340d8917645dff107fd208aea..2fd3335868406455ec01f9ded6bacc7bda5e2a67 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -1,13 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc
index b5b7bc940a85ac2bbb6c6b303284777df714b7d6..95673ba19e776b3c52eb492d0b14d761b584f807 100644
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lrn_op.h"
 
@@ -140,7 +140,7 @@ class LRNOp : public framework::OperatorWithKernel {
 template <typename T>
 class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor) The input of LRN operator. "
diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu
index c6857c2b6d0a9011ef83d115e6edd81bf2f8a0ca..eb9d66a73dfe3e22f1151d73ce5e34f2eda0835e 100644
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lrn_op.h"
 
diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h
index 44063d3e036809eb236bbe7c46aa0cce06b46df0..ef3a2883a88ff321fb9a87ddaf31123a3b9ee90a 100644
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
index 2db7da30db416e03cf473c8e65b023d9265e9193..b8fcec0f29b46e838f91ad1ee0fded8e42f27bd5 100644
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
@@ -102,7 +102,7 @@ class LSTMOp : public framework::OperatorWithKernel {
 
 class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Input",
              "(LoDTensor) the first input is a LodTensor, which support "
@@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc
index 48519bed6f7d927b40d02683a7e9f2acfb8b85e5..cfcc1fc92a074c9bfe83e6c32560177edef12ae9 100644
--- a/paddle/operators/lstm_op.cu.cc
+++ b/paddle/operators/lstm_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lstm_op.h"
 
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
index 14abd4bf0a6e73a9c0f000f53a5e1e380f01d1c0..c57ee414dc5b3417549c8ac3a7fd57a9c8f452df 100644
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/operators/math/lstm_compute.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/sequence2batch.h"
@@ -102,9 +103,12 @@ class LSTMKernel : public framework::OpKernel<T> {
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
-    auto gate_act = ctx.Attr<std::string>("gate_activation");
-    auto cell_act = ctx.Attr<std::string>("cell_activation");
-    auto cand_act = ctx.Attr<std::string>("candidate_activation");
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
 
     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
@@ -264,9 +268,12 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
     batch_gate_g.set_lod(batch_gate->lod());
 
-    auto gate_act = ctx.Attr<std::string>("gate_activation");
-    auto cell_act = ctx.Attr<std::string>("cell_activation");
-    auto cand_act = ctx.Attr<std::string>("candidate_activation");
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index 18b9cdf2a39e8226c634194ff2cc56d169979774..c2d2c43982580c9724849d68576d42ffa44fc6b4 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/lstm_unit_op.h"
 
@@ -48,10 +48,12 @@ class LstmUnitOp : public framework::OperatorWithKernel {
 
 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LstmUnitOpMaker(framework::OpProto* proto,
-                  framework::OpAttrChecker* op_checker)
+  LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "FC input before the non-linear activation.");
+    AddInput("X",
+             "Lstm unit only applies non-linear activations, please make sure"
+             "that linear tranformation has already been applied to `X`. "
+             "Linear tranformation can be applied by adding a `fc` layer");
     AddInput(
         "C_prev",
         "The cell state tensor of last time-step in the Lstm Unit operator.");
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
index 291f2c295e78288c01c6575df936ceedceba7ce8..5ee5ddd280f7720c4583053e4e48a5043ab423f4 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 /* Acknowledgement: the following code is strongly inspired by
 https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu
@@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
 
     auto* x_tensor = ctx.Input<framework::Tensor>("X");
     auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
@@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
 
     auto x_tensor = ctx.Input<Tensor>("X");
     auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
index 61705675d930369ea8d491229caa1b4046f3e16a..fa8d141bcb6ee4bfc9a29e337b7adbc5ecd3ad23 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 /* Acknowledgement: the following code is strongly inspired by
 https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
index 42e8961c0ea57650a823ee4b58516f66a455b385..e0df3077742bc330ce8510bf06b0411148f669d8 100644
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/margin_rank_loss_op.h"
 
@@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel {
 template <typename T>
 class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MarginRankLossOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X1",
              "(2-D tensor with shape [batch_size x 1]) The score for "
diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu
index 1c2afccc5b32e22c939a275d8c69ad774d3ebdad..798c3ed182b08b07a779da88924bfc05743c680e 100644
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/operators/margin_rank_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/margin_rank_loss_op.h"
 
diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h
index 9c1f96cac13f1bdb8c5dfd3e771157d1d1c60e15..7438e881e1c69c9ef1f84b21e6ee0ba093f3378a 100644
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/operators/margin_rank_loss_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc
index 6011a196d446854877e162019f6745deb501ee9d..d9cb016fb440b6b2fe1d222812215feb5970dc4f 100644
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/math/cross_entropy.h"
 
diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu
index 2132d49c937a85afeed0e0cee0a74a7e30c6a3ca..16c9e7b28ec8d453492455c8d620ba9edf130a07 100644
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/math/cross_entropy.h"
 
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h
index 677adb5adaf4041fe7acfd29be354073535fd5fc..b3b6d767a8b8f59e3c75e72ac6c98653a8e1c3a4 100644
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/math/detail/activation_functions.h b/paddle/operators/math/detail/activation_functions.h
index a20c35d1d9dc4a3a6fae92023fd1aae787a716ec..585a0123437a39c2b610306b18fe0a970c0ed072 100644
--- a/paddle/operators/math/detail/activation_functions.h
+++ b/paddle/operators/math/detail/activation_functions.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/hostdevice.h"
 
 #ifdef __AVX__
@@ -29,6 +30,26 @@ namespace detail {
 #define SIGMOID_THRESHOLD_MAX 13.0
 #define EXP_MAX_INPUT 40.0
 
+enum ActivationType {
+  kSigmoid,
+  kReLU,
+  kTanh,
+  kIdentity,
+};
+
+inline ActivationType GetActivationType(const std::string &type) {
+  if (type == "sigmoid") {
+    return ActivationType::kSigmoid;
+  } else if (type == "relu") {
+    return ActivationType::kReLU;
+  } else if (type == "tanh") {
+    return ActivationType::kTanh;
+  } else if (type == "identity" || type == "") {
+    return ActivationType::kIdentity;
+  }
+  PADDLE_THROW("Not support type %s.", type);
+}
+
 namespace forward {
 
 template <typename T>
diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h
index a734ad31eea4816e952641bad73776d93d8c8d34..42888fcdb0a464892e3007ee73c195fcd2a431bb 100644
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
@@ -26,10 +26,9 @@ namespace detail {
 
 template <class T, class Op>
 void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frame_size,
-                                     activation_mode_t active_node,
-                                     activation_mode_t active_gate,
-                                     activation_mode_t active_state) {
+                                     int frame_size, ActivationType active_node,
+                                     ActivationType active_gate,
+                                     ActivationType active_state) {
   T r_value_in;
   T r_value_ig;
   T r_value_fg;
@@ -77,9 +76,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                       LstmMetaGrad<T> grad, int frame_size,
-                                      activation_mode_t active_node,
-                                      activation_mode_t active_gate,
-                                      activation_mode_t active_state) {
+                                      ActivationType active_node,
+                                      ActivationType active_gate,
+                                      ActivationType active_state) {
   T r_value_in;
   T r_value_ig;
   T r_value_fg;
@@ -149,10 +148,9 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 
 template <class T, class Op>
 void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                   int frame_size,
-                                   activation_mode_t active_node,
-                                   activation_mode_t active_gate,
-                                   activation_mode_t active_state) {
+                                   int frame_size, ActivationType active_node,
+                                   ActivationType active_gate,
+                                   ActivationType active_state) {
 #ifdef __AVX__
   __m256 r_value_in;
   __m256 r_value_ig;
@@ -204,9 +202,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                     LstmMetaGrad<T> grad, int frame_size,
-                                    activation_mode_t active_node,
-                                    activation_mode_t active_gate,
-                                    activation_mode_t active_state) {
+                                    ActivationType active_node,
+                                    ActivationType active_gate,
+                                    ActivationType active_state) {
 #ifdef __AVX__
   __m256 r_value_in;
   __m256 r_value_ig;
@@ -281,9 +279,8 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 
 template <class T, class Op>
 void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate,
-                      activation_mode_t active_state) {
+                      ActivationType active_node, ActivationType active_gate,
+                      ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
     avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                      active_gate, active_state);
@@ -295,9 +292,9 @@ void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
 
 template <class T, class Op>
 void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, activation_mode_t active_node,
-                       activation_mode_t active_gate,
-                       activation_mode_t active_state) {
+                       int frame_size, ActivationType active_node,
+                       ActivationType active_gate,
+                       ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
     avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
                                       active_gate, active_state);
diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h
index 91bfedea53a2600156c9025f6ff3615d695a712b..e31e657e8b6964c2b99f6e456545c83d8da8e7f9 100644
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -31,9 +31,9 @@ namespace detail {
  */
 template <class T, class Op, bool is_batch>
 __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
-                              int batch_size, activation_mode_t active_node,
-                              activation_mode_t active_gate,
-                              activation_mode_t active_state) {
+                              int batch_size, ActivationType active_node,
+                              ActivationType active_gate,
+                              ActivationType active_state) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
 
@@ -91,9 +91,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
 template <class T, class Op, bool is_batch>
 __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
                                LstmMetaGrad<T> grad, int frame_size,
-                               int batch_size, activation_mode_t active_node,
-                               activation_mode_t active_gate,
-                               activation_mode_t active_state) {
+                               int batch_size, ActivationType active_node,
+                               ActivationType active_gate,
+                               ActivationType active_state) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
 
@@ -185,9 +185,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate,
-                      activation_mode_t active_state) {
+                      ActivationType active_node, ActivationType active_gate,
+                      ActivationType active_state) {
   dim3 threads;
   dim3 grid;
   if (batch_size == 1) {
@@ -220,9 +219,8 @@ template <class T, class Op>
 void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                        LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                        int frame_size, int batch_size,
-                       activation_mode_t active_node,
-                       activation_mode_t active_gate,
-                       activation_mode_t active_state) {
+                       ActivationType active_node, ActivationType active_gate,
+                       ActivationType active_state) {
   dim3 threads;
   dim3 grid;
   if (batch_size == 1) {
diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h
index 78f9a249a3d5d413452952edf990975c02f1a369..fed8f9c4ca48905ad4c524ba400e8c7bb2f7fbd1 100644
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
@@ -30,9 +30,9 @@ class lstm {
   HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                              T &prev_state, T &state, T &state_atv, T &output,
                              T &checkI, T &checkF, T &checkO,
-                             activation_mode_t active_node,
-                             activation_mode_t active_gate,
-                             activation_mode_t active_state) {
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
     value_in = activation(value_in, active_node);
     value_ig = activation(value_ig + prev_state * checkI, active_gate);
     value_fg = activation(value_fg + prev_state * checkF, active_gate);
@@ -53,9 +53,9 @@ class lstm {
                              __m256 &prev_state, __m256 &state,
                              __m256 &state_atv, __m256 &output, __m256 &checkI,
                              __m256 &checkF, __m256 &checkO,
-                             activation_mode_t active_node,
-                             activation_mode_t active_gate,
-                             activation_mode_t active_state) {
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
     value_in = activation(value_in, active_node);
     value_ig =
         activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
@@ -87,9 +87,9 @@ class lstm {
                              T &state_grad, T &state_atv, T &output_grad,
                              T &checkI, T &checkF, T &checkO, T &checkIGrad,
                              T &checkFGrad, T &checkOGrad,
-                             activation_mode_t active_node,
-                             activation_mode_t active_gate,
-                             activation_mode_t active_state) {
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
     grad_og = activation(output_grad * state_atv, value_og, active_gate);
     state_grad += activation(output_grad * value_og, state_atv, active_state) +
                   grad_og * checkO;
@@ -114,8 +114,8 @@ class lstm {
       __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
       __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
       __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
-      __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
-      activation_mode_t active_gate, activation_mode_t active_state) {
+      __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node,
+      ActivationType active_gate, ActivationType active_state) {
     grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
                          active_gate);
     state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc
index 707ebf05962fb65892c2adbbf41a0a3449763d31..c2633b2e16434558d16f699a701e7b8cf1de8342 100644
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -61,14 +61,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
 
     const T* im_data = im.data<T>();
     T* col_data = col->data<T>();
-
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
       int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / filter_width / filter_height;
+      int c_im = c / (filter_width * filter_height);
       for (int h = 0; h < col_height; ++h) {
+        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
         for (int w = 0; w < col_width; ++w) {
-          int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
           int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
           int col_idx = (c * col_height + h) * col_width + w;
           int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
@@ -130,16 +129,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
       int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / filter_width / filter_height;
+      int c_im = c / (filter_width * filter_height);
       for (int h = 0; h < col_height; ++h) {
+        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
         for (int w = 0; w < col_width; ++w) {
-          int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
           int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-
           if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
               (im_col_idx) >= 0 && (im_col_idx) < im_width) {
-            im_row_idx += c_im * im_height;
-            im_data[im_row_idx * im_width + im_col_idx] +=
+            im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] +=
                 col_data[(c * col_height + h) * col_width + w];
           }
         }
@@ -199,12 +196,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
         for (int channel = 0; channel < im_channels; ++channel) {
           for (int filter_row_idx = 0; filter_row_idx < filter_height;
                ++filter_row_idx) {
+            int im_row_offset =
+                col_row_idx * stride[0] + filter_row_idx - padding[0];
             for (int filter_col_idx = 0; filter_col_idx < filter_width;
                  ++filter_col_idx) {
-              int im_row_offset =
-                  col_row_idx * stride[0] + filter_row_idx - padding[0];
               int im_col_offset =
                   col_col_idx * stride[1] + filter_col_idx - padding[1];
+
               int col_offset =
                   ((((col_row_idx)*col_width + col_col_idx) * im_channels +
                     channel) *
@@ -271,12 +269,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
         for (int channel = 0; channel < im_channels; ++channel) {
           for (int filter_row_idx = 0; filter_row_idx < filter_height;
                ++filter_row_idx) {
+            int im_row_offset =
+                col_row_idx * stride[0] + filter_row_idx - padding[0];
             for (int filter_col_idx = 0; filter_col_idx < filter_width;
                  ++filter_col_idx) {
-              int im_row_offset =
-                  col_row_idx * stride[0] + filter_row_idx - padding[0];
               int im_col_offset =
                   col_col_idx * stride[1] + filter_col_idx - padding[1];
+
               int col_offset =
                   (((col_row_idx * col_width + col_col_idx) * im_channels +
                     channel) *
@@ -284,6 +283,7 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                    filter_row_idx) *
                       filter_width +
                   filter_col_idx;
+
               if (im_row_offset >= 0 && im_row_offset < im_height &&
                   im_col_offset >= 0 && im_col_offset < im_width) {
                 int im_offset =
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index 256f3bc9bd487d11b0f139ef057f5a98556b4db1..26c038e435827b401d723ee6eef2255a89670f46 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -159,6 +159,7 @@ void testIm2col() {
 TEST(math, im2col) {
   testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
-  testIm2col<paddle::platform::CUDADeviceContext, paddle::platform::GPUPlace>();
+  testIm2col<paddle::platform::CUDADeviceContext,
+             paddle::platform::CUDAPlace>();
 #endif
 }
diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc
index 2c2e8bb82e6f51e21a00de53bbfce5f0b4868e27..d453102ecefc9d79e1f4474ba94be0eb69a87c85 100644
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const std::string& gate_act, const std::string& cell_act,
-                      const std::string& cand_act) {
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
-                               ActiveType(cand_act), ActiveType(gate_act),
-                               ActiveType(cell_act));
+                               cand_act, gate_act, cell_act);
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
       value.state_active_value += frame_size;
@@ -46,12 +46,12 @@ struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
-                      const std::string& gate_act, const std::string& cell_act,
-                      const std::string& cand_act) {
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
-                                frame_size, ActiveType(cand_act),
-                                ActiveType(gate_act), ActiveType(cell_act));
+                                frame_size, cand_act, gate_act, cell_act);
 
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu
index 92b1f4228b49709d2903fab518e7649133932fad..82065d699f760db6cc86bf3d6c56e51c583c6ace 100644
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/operators/math/lstm_compute.cu
@@ -24,11 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const std::string& gate_act, const std::string& cell_act,
-                      const std::string& cand_act) {
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
     detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
-                                frame_size, batch_size, ActiveType(cand_act),
-                                ActiveType(gate_act), ActiveType(cell_act));
+                                frame_size, batch_size, cand_act, gate_act,
+                                cell_act);
   }
 };
 
@@ -37,11 +38,12 @@ struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
-                      const std::string& gate_act, const std::string& cell_act,
-                      const std::string& cand_act) {
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
     detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
-                              frame_size, batch_size, ActiveType(cand_act),
-                              ActiveType(gate_act), ActiveType(cell_act));
+                              frame_size, batch_size, cand_act, gate_act,
+                              cell_act);
   }
 };
 
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h
index 5f74e273585aea5184281bf294df694235150e30..954762f92286fe13bd2c08ec03c3ac96bb663cca 100644
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 
@@ -72,8 +73,9 @@ class LstmUnitFunctor {
  public:
   static void compute(const DeviceContext &context, LstmMetaValue<T> value,
                       int frame_size, int batch_size,
-                      const std::string &gate_act, const std::string &cell_act,
-                      const std::string &cand_act);
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
 };
 
 template <typename DeviceContext, typename T>
@@ -81,8 +83,9 @@ class LstmUnitGradFunctor {
  public:
   static void compute(const DeviceContext &context, LstmMetaValue<T> value,
                       LstmMetaGrad<T> grad, int frame_size, int batch_size,
-                      const std::string &gate_act, const std::string &cell_act,
-                      const std::string &cand_act);
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index a05810d7781f5286e70b53005ef0b193c945c54c..d4f12f0a106e077ac31aa37f46857b74e1e99b59 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -277,14 +277,6 @@ void set_constant_with_place<platform::CPUPlace>(
                            TensorSetConstantCPU(tensor, value));
 }
 
-template <>
-void set_constant_with_place<platform::MKLDNNPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstantCPU(tensor, value));
-}
-
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
   TensorSetConstantWithPlace(const platform::DeviceContext& context,
                              framework::Tensor* tensor, float value)
@@ -310,8 +302,29 @@ void set_constant(const platform::DeviceContext& context,
 #endif
 }
 
+template <typename T>
+struct RowwiseAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+
+    auto in = framework::EigenMatrix<T>::From(input);
+    auto vec = framework::EigenVector<T>::Flatten(vector);
+    auto out = framework::EigenMatrix<T>::From(*output);
+
+    for (int64_t i = 0; i < in_dims[0]; ++i) {
+      out.chip(i, 0) = in.chip(i, 0) + vec;
+    }
+  }
+};
+
 template struct RowwiseAdd<platform::CPUDeviceContext, float>;
 template struct RowwiseAdd<platform::CPUDeviceContext, double>;
+
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
 
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index e33070c40fbfa7f2794426247ef77b8fcaee4ec6..d47a7f818ded61baf31e46ea3b8ae3101324111f 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -105,7 +105,7 @@ void matmul<platform::CUDADeviceContext, float>(
   PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
                      platform::is_gpu_place(matrix_b.place()) &&
                      platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in GPUPlace");
+                 "Matrix must all be in CUDAPlace");
 
   int M = dim_out[0];
   int N = dim_out[1];
@@ -134,7 +134,7 @@ void matmul<platform::CUDADeviceContext, double>(
   PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
                      platform::is_gpu_place(matrix_b.place()) &&
                      platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in GPUPlace");
+                 "Matrix must all be in CUDAPlace");
 
   int M = dim_out[0];
   int N = dim_out[1];
@@ -266,20 +266,42 @@ struct TensorSetConstantGPU {
 };
 
 template <>
-void set_constant_with_place<platform::GPUPlace>(
+void set_constant_with_place<platform::CUDAPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
   framework::VisitDataType(framework::ToDataType(tensor->type()),
                            TensorSetConstantGPU(context, tensor, value));
 }
 
-template <>
-void set_constant_with_place<platform::CudnnPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  set_constant_with_place<platform::GPUPlace>(context, tensor, value);
+template <typename T>
+__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
+                                 int num) {
+  T tmp = 1.0 / width;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    int h = i * tmp;
+    int w = i - h * width;
+    c[i] = a[i] + b[w];
+  }
 }
 
+template <typename T>
+struct RowwiseAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    int blocks = 512;
+    int grids = (input.numel() + blocks - 1) / blocks;
+    RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
+        input.data<T>(), vector.data<T>(), output->data<T>(),
+        static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
+  }
+};
+
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
index 3e6d83386589a02c7d8f62394c1c2becb606504c..de591626df28e2bc3391b609f909612411398247 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -46,38 +46,46 @@ void Transpose<DeviceContext, T, Rank>::operator()(
 }
 
 template <typename DeviceContext, typename T>
-void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
+void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                               const framework::Tensor& input,
-                                              const framework::Tensor& vector,
-                                              framework::Tensor* output) {
+                                              framework::Tensor* out) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector.numel(), size);
-  PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+  PADDLE_ENFORCE_EQ(out->numel(), size);
 
   auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenMatrix<T>::From(vector);
-  auto out = framework::EigenMatrix<T>::From(*output);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
-  out.device(*context.eigen_device()) =
-      in + vec.reshape(shape).broadcast(bcast);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
 }
 
-template <typename DeviceContext, typename T>
-void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
-                                              const framework::Tensor& input,
-                                              framework::Tensor* vector) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), size);
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// colwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class ColwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
 
-  auto vec = framework::EigenMatrix<T>::From(*vector);
-  auto in = framework::EigenMatrix<T>::From(input);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  vec.reshape(shape).device(*context.eigen_device()) =
-      in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
-}
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        if (i == 0) {
+          out_buf[j] = in_buf[i * size + j];
+        } else {
+          out_buf[j] += in_buf[i * size + j];
+        }
+      }
+    }
+  }
+};
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
index 32e96d948714a8fd1fa2c089057603fdaed85c16..4325a79664f15cfaea48870cd503ce70cc31044f 100644
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
@@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) {
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
   paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) {
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
   paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
   paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
   paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) {
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
   T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
 
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
   paddle::framework::Tensor g_mat_a;
   paddle::framework::Tensor g_vec_b;
   paddle::framework::Tensor g_vec_c;
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index c44577e00af5f362ae7e168495e496d60d05de95..9fddd97a36f7fdb6628d6eeb192cb216fdae3e5b 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -58,15 +58,15 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE(platform::is_gpu_place(out_place));
 
     memory::Copy(
-        boost::get<platform::GPUPlace>(out_place), out_data,
-        boost::get<platform::GPUPlace>(in1_place), in1_data,
+        boost::get<platform::CUDAPlace>(out_place), out_data,
+        boost::get<platform::CUDAPlace>(in1_place), in1_data,
         in1_value.numel() * sizeof(T),
         reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
 
     auto* in2_data = in2_value.data<T>();
-    memory::Copy(boost::get<platform::GPUPlace>(out_place),
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
                  out_data + in1_value.numel(),
-                 boost::get<platform::GPUPlace>(in2_place), in2_data,
+                 boost::get<platform::CUDAPlace>(in2_place), in2_data,
                  in2_value.numel() * sizeof(T), context.stream());
   }
 };
@@ -160,9 +160,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
 
     auto* in1_data = in1_value.data<T>();
     auto* in2_data = in2_value->data<T>();
-    memory::Copy(boost::get<platform::GPUPlace>(in2_place),
+    memory::Copy(boost::get<platform::CUDAPlace>(in2_place),
                  in2_data + input2_offset,
-                 boost::get<platform::GPUPlace>(in1_place), in1_data,
+                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
                  in1_value.numel() * sizeof(T), context.stream());
   }
 };
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
index 777caf5635647d11e8fde05a68fdf7e2c32f48df..0a2e36f68acee04bd6b272d37679c18231cb8760 100644
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) {
   using namespace paddle::platform;
   using namespace paddle::operators::math;
 
-  GPUPlace gpu_place(0);
+  CUDAPlace gpu_place(0);
   CPUPlace cpu_place;
   CUDADeviceContext ctx(gpu_place);
   SetConstant<CUDADeviceContext, float> functor;
@@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   using namespace paddle::platform;
   using namespace paddle::operators::math;
 
-  GPUPlace gpu_place(0);
+  CUDAPlace gpu_place(0);
   CPUPlace cpu_place;
   CUDADeviceContext ctx(gpu_place);
   SetConstant<CUDADeviceContext, float> functor;
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index f46db3c56713399798a45854bf1613d07aee26e6..3794f0e52d200a08253a979991da04ec564cae47 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -122,6 +122,6 @@ TEST(math, vol2col) {
   testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
   testVol2col<paddle::platform::CUDADeviceContext,
-              paddle::platform::GPUPlace>();
+              paddle::platform::CUDAPlace>();
 #endif  // PADDLE_WITH_CUDA
 }
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
index ee0bc0c3708ac20ad00e3222060244d42dbd6f2f..fd65d894d5749c97f860d614de354e89f6d9441d 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -130,7 +130,7 @@ class MatMulOp : public framework::OperatorWithKernel {
 
 class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of MatMul op");
     AddInput("Y", "The second input of MatMul op");
diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc
index 6a3772c00457993dcc7b55a0f15493974633026c..d28d12164e493786c5bdafb157795d797ee87b91 100644
--- a/paddle/operators/matmul_op.cu.cc
+++ b/paddle/operators/matmul_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/matmul_op.h"
 
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index de9da487b3d627cc79962db3770632813e9cd9f5..78adc64f76f45afce64c49bcf734647e0db2d6b3 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc
index 798022c9dd904a0ac189b4b550a94264a433ebf2..019150e4914e8bd34a5e8b7d37318aee43942fcc 100644
--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/op_registry.h"
@@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
     auto *out =
@@ -40,8 +40,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
 
 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxSeqenceLenOpProtoMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("RankTable", "The lod_rank_table.");
     AddOutput("Out", "The max sequence length.");
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
index 011616e615a36efa0efe9ff15e678f1486c5177a..3ee32269417e80cd14a6ff0f8e52c0b2dec4b8be 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -20,7 +20,7 @@ using framework::Tensor;
 
 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
index 2904f0ff96f06cefad29a65898cd82107d9bd600..c4a2d676d3aca4d59d0bfa8c75aa0c249e202ab5 100644
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/maxout_op.h"
 
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 8932d700c2ae17eefe919eefae2282ae4a5a80a8..411f4d14efbfa5a8ee6dd7da645a044b191bf006 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -32,7 +32,7 @@ class MeanOp : public framework::OperatorWithKernel {
 
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op");
@@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
     grad_op->SetType("mean_grad");
     grad_op->SetInput("X", Input("X"));
     grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
index 93062bf540ad64350f7ee9a554c3c469aba46677..212d4481138c1478f6e3aa684008f9e42c5a3870 100644
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index adc688dbd5e13a2203d6842a12acdb8625288275..3f999e404f8afe6bded09c820509fa0f36d30bf6 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
@@ -114,8 +118,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
 
 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input LoDTensor, contains complete lod information to "
@@ -162,15 +165,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("split_lod_tensor");
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetInput("Mask", Input("Mask"));
     grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
     grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 27f0c8de2053064e65d9984ec9bd4242fee48e5f..3d7742dd4bc2a3c727279bc1e6c7dd47b96eefa3 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/minus_op.h"
 #include "paddle/operators/net_op.h"
@@ -46,7 +46,7 @@ class MinusOp : public framework::OperatorWithKernel {
 
 class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left tensor of minus operator.");
     AddInput("Y", "The right tensor of minus operator.");
@@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
  public:
   using framework::GradOpDescMakerBase::GradOpDescMakerBase;
 
-  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
-      const override {
-    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<framework::OpDesc>> ops;
     auto x_g = InputGrad("X");
     if (!x_g.empty()) {
-      auto *x_g_op = new framework::OpDescBind();
+      auto *x_g_op = new framework::OpDesc();
       x_g_op->SetType("scale");
       x_g_op->SetInput("X", OutputGrad("Out"));
       x_g_op->SetOutput("Out", x_g);
@@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
 
     auto y_g = InputGrad("Y");
     if (!y_g.empty()) {
-      auto *y_g_op = new framework::OpDescBind();
+      auto *y_g_op = new framework::OpDesc();
       y_g_op->SetType("scale");
       y_g_op->SetInput("X", OutputGrad("Out"));
       y_g_op->SetOutput("Out", y_g);
diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu
index 3b202ea92ee8692f2441909083f559adff5fea8c..80cd9f7c16845904b7b46ae1597ce9558c32f46a 100644
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/minus_op.h"
 
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
index 78e1e1be6d622d504db9e664dcb5f35ca0c22b95..20760b8cd5bd2f74ed8469addda8f67f11f4545c 100644
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index f0a42491bf04a5bbe2de10de2f702877c9a2f839..f5d69071a86e3f8037840c091cf5b7683e4eeb96 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/modified_huber_loss_op.h"
 
@@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
 
 class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ModifiedHuberLossOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input tensor of modified huber loss op. "
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
index 40a8447da4d9d4874af232f3408557c950b58482..3d2a5562e8cc2117b0b460496d9ba8e96823fbfb 100644
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
index 157ae0682e0cf4392dab003153d44f48209d00a1..6ce86feee574efca8811f316f47f1c3fbbdd0bf9 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
index 2ab48fedecf0cce95dcf4d0593dcd4b30bc1f505..15b8b80776732f43c3ef4f8b80cffedf5c2a76fd 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -54,8 +54,7 @@ class MomentumOp : public framework::OperatorWithKernel {
 
 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MomentumOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu
index 00f1253465d336e0fad580d0c6b898369e4783ca..2b9314162e6f10b3791c913203c732d2822861ab 100644
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index bc4a5fdf0b37ce07b4c07bba9e1af5611d2be7e3..c923e988a55b43ebb7ba6256e7b72a85c124f360 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -71,41 +71,52 @@ class MulOpShapeInference : public framework::InferShapeBase {
 
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of mul op");
-    AddInput("Y", "The second input of mul op");
-    AddOutput("Out", "The output of mul op");
+    AddInput("X", "(Tensor), The first input tensor of mul op.");
+    AddInput("Y", "(Tensor), The second input tensor of mul op.");
+    AddOutput("Out", "(Tensor), The output tensor of mul op.");
     AddAttr<int>(
         "x_num_col_dims",
-        "(int, default 1) "
-        R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
-            in that case, tensors will be reshaped to a matrix. The matrix's first
-            dimension(column length) will be the product of tensor's last
-            `num_col_dims` dimensions, and the matrix's second dimension(row length)
-            will be the product of tensor's first `rank - num_col_dims` dimensions.
+        R"DOC((int, default 1), The mul_op can take tensors with more than two
+              dimensions as its inputs. If the input $X$ is a tensor with more
+              than two dimensions, $X$ will be flattened into a two-dimensional
+              matrix first. The flattening rule is: the first `num_col_dims`
+              will be flattened to form the first dimension of the final matrix
+              (the height of the matrix), and the rest `rank(X) - num_col_dims`
+              dimensions are flattened to form the second dimension of the final
+              matrix (the width of the matrix). As a result, height of the
+              flattened matrix is equal to the product of $X$'s first
+              `x_num_col_dims` dimensions' sizes, and width of the flattened
+              matrix is equal to the product of $X$'s last `rank(x) - num_col_dims`
+              dimensions' size. For example, suppose $X$ is a 6-dimensional
+              tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3.
+              Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] =
+              [24, 30].
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
     AddAttr<int>(
         "y_num_col_dims",
-        "(int, default 1) "
-        R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
-             in that case, tensors will be reshaped to a matrix. Just like input `X`.
+        R"DOC((int, default 1), The mul_op can take tensors with more than two,
+              dimensions as its inputs. If the input $Y$ is a tensor with more
+              than two dimensions, $Y$ will be flattened into a two-dimensional
+              matrix first. The attribute `y_num_col_dims` determines how $Y$ is
+              flattened. See comments of `x_num_col_dims` for more details.
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
     AddComment(R"DOC(
-Mul Operator. 
+Mul Operator.
 
-This operator is used to perform matrix multiplication for input X and Y.
+This operator is used to perform matrix multiplication for input $X$ and $Y$.
 
 The equation is:
 
-    $$Out = X * Y$$
+$$Out = X * Y$$
 
-Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input `X`.
+Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input $X$.
 
 )DOC");
   }
diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc
index 6095de58d0c58be6b647771e9784348cbf8c4ad4..43de9a719499e4e0e8fd2e5fcc6771d717ce6522 100644
--- a/paddle/operators/mul_op.cu.cc
+++ b/paddle/operators/mul_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/mul_op.h"
 
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 1b467dca8302c10fe08a157aac4586230e096dd0..1fb0569b49cce80c3f1e408fb57b5f5cf7033a27 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index b1ee8051c4c48f575690b38142ae082930fe2070..11e047b5d57b6bc18e6d6f4a1d122e18dfc6e357 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/multiplex_op.h"
 
@@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
@@ -61,8 +61,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
 
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MultiplexOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Ids", "The index tensor of multiplex operator.");
     AddInput("X", "The candidate tensors of multiplex operator.")
@@ -103,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 47986e9ff86f2e08b0861cde35ac3a44b10caed1..f49ee71f104b72f5c8ea5fb1d49999528c21832e 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/multiplex_op.h"
@@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
-    platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       int32_t k = index[i];
       PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
@@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = ctx.cuda_device_context().stream();
-    platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       size_t k = static_cast<size_t>(index[i]);
       if (d_ins[k]) {
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
index 344315116122f7ad843af740be8a31313c8a0342..ef66be5556ee613a037de13286ecc66b53885c1f 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -1,17 +1,16 @@
-
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc
index 6be735e4c731f79684e0bdac3d69a30b328fed84..1602a3d9b54dd64813770a7162f8d4f3dd0e791a 100644
--- a/paddle/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
@@ -1,13 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/platform/gpu_info.h"
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 48e322f99398a7f1d6af9cab653d0cc92d981fe0..5173996f2020ec7a94643277e8c7a532d41d9045 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 22a37ff1bbf6b8cfb2cbc3c3dbbb20a87c5ea4e7..9d51153b0631b988c9297f395672be67e18ee3f9 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -1,13 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/nccl/nccl_gpu_common.h"
@@ -24,7 +27,7 @@ class NCCLInitOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
@@ -43,8 +46,7 @@ class NCCLInitOp : public framework::OperatorBase {
 
 class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLInitOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
@@ -52,7 +54,7 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
     AddComment(R"DOC(
 NCCLInit Operator.
 
@@ -141,8 +143,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
 // AllreduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllReduceOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of AllReduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
@@ -163,8 +164,7 @@ AllReduce the input tensors.
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLReduceOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
@@ -190,8 +190,7 @@ Reduce the tensors.
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of BcastSend op");
     AddInput("Communicator", "Communicator for communicating between gpus");
diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc
index 6ca6db7253da0e59c742f115cd25a1b8203a3044..1b986a13650de7d77f4828d71798ee00d61c1284 100644
--- a/paddle/operators/nccl_op.cu.cc
+++ b/paddle/operators/nccl_op.cu.cc
@@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     auto stream = ctx.cuda_device_context().stream();
 
     // device id
-    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
 
     for (size_t i = 0; i < ins.size(); ++i) {
@@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
                       ctx.device_context())
                       .stream();
     // device id
-    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
 
     auto ins_names = ctx.Inputs("X");
@@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
                       ctx.device_context())
                       .stream();
     // device id
-    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
 
     if (idx == root) {
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index d747cc0cf5f74b886bbd40549673e7d64de952e9..6546096069d4c3fbc4908a16c2dba2ac6d7e6421 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/program_desc.h"
@@ -49,9 +50,9 @@ const f::DDim kDims = {100, 100};
 class NCCLTester : public ::testing::Test {
  public:
   virtual void SetUp() override {
-    cpu_ctx = new p::CPUDeviceContext(p::CPUPlace());
+    paddle::platform::CPUPlace cpu_place;
     for (size_t i = 0; i < gpu_list.size(); ++i) {
-      p::GPUPlace place(i);
+      p::CUDAPlace place(i);
       dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
     }
 
@@ -65,7 +66,8 @@ class NCCLTester : public ::testing::Test {
   }
 
   void NCCLInitOp() {
-    std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
+    paddle::platform::CPUPlace cpu_place;
+    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
 
     op1->SetType("ncclInit");
     op1->SetOutput("Communicator", {"comm"});
@@ -76,17 +78,16 @@ class NCCLTester : public ::testing::Test {
 
     auto op = f::OpRegistry::CreateOp(*op1);
     VLOG(1) << "invoke NCCLInitOp.";
-    op->Run(g_scope, *cpu_ctx);
+    op->Run(g_scope, cpu_place);
     VLOG(1) << "NCCLInitOp finished.";
   }
 
   template <class T>
-  void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc,
-                        f::Scope *scope) {
+  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
     std::unique_lock<std::mutex> lk(mu);
-    const f::OpDescBind *op1 = &op_desc;
+    const f::OpDesc *op1 = &op_desc;
 
-    p::GPUPlace place(gpu_id);
+    p::CUDAPlace place(gpu_id);
     auto &ctx = dev_ctxs.at(gpu_id);
 
     auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
@@ -112,40 +113,39 @@ class NCCLTester : public ::testing::Test {
     VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
     VLOG(1) << " send_tensor : " << send_tensor->numel()
             << " recv_tensor : " << recv_tensor->numel();
-    op->Run(*scope, *ctx);
+    op->Run(*scope, place);
     VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
   }
 
  public:
   std::vector<p::DeviceContext *> dev_ctxs;
-  p::DeviceContext *cpu_ctx;
   f::Scope g_scope;
   std::mutex mu;
 };
 
 // ncclInitOp with desc
 TEST(NCCL, ncclInitOp) {
-  std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
 
   op_desc->SetType("ncclInit");
   op_desc->SetOutput("Communicator", {"x1"});
   op_desc->SetAttr("gpus", {gpu_list});
 
   f::Scope g_scope;
-  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
+  paddle::platform::CPUPlace cpu_place;
 
   auto *var = g_scope.Var("x1");
   var->GetMutable<p::Communicator>();
 
   auto op = f::OpRegistry::CreateOp(*op_desc);
   VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx.get());
+  op->Run(g_scope, cpu_place);
   VLOG(1) << "NCCLInitOp finished.";
 }
 
 // ncclAllReduceOp with desc
 TEST_F(NCCLTester, ncclAllReduceOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   op2->SetType("ncclAllReduce");
   op2->SetInput("X", {"st"});
   op2->SetInput("Communicator", {"comm"});
@@ -171,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
 
   for (size_t i = 0; i < dev_scopes.size(); ++i) {
     p::CPUPlace cpu_place;
-    p::GPUPlace gpu_place(gpu_list[i]);
+    p::CUDAPlace gpu_place(gpu_list[i]);
 
     auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
     auto *rt = recv_tensor.data<float>();
@@ -180,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
     paddle::memory::Copy(
-        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+        cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
         recv_tensor.numel() * sizeof(float),
         static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
 
@@ -192,7 +192,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
 
 // ncclReduceOp with desc
 TEST_F(NCCLTester, ncclReduceOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 0;
   op2->SetType("ncclReduce");
   op2->SetInput("X", {"st"});
@@ -219,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
   float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
 
   p::CPUPlace cpu_place;
-  p::GPUPlace gpu_place(gpu_list[kRoot]);
+  p::CUDAPlace gpu_place(gpu_list[kRoot]);
 
   auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
@@ -229,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
   auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
   paddle::memory::Copy(
-      cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
       recv_tensor.numel() * sizeof(float),
       static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
 
@@ -240,7 +240,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
 
 // ncclBcastOp with desc
 TEST_F(NCCLTester, ncclBcastOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 5;
   op2->SetType("ncclBcast");
   op2->SetInput("X", {"st"});
@@ -268,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
   float result = kRoot;
 
   p::CPUPlace cpu_place;
-  p::GPUPlace gpu_place(gpu_list[idx]);
+  p::CUDAPlace gpu_place(gpu_list[idx]);
 
   auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
@@ -277,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
   auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
   paddle::memory::Copy(
-      cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
       recv_tensor.numel() * sizeof(float),
       static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
 
@@ -295,9 +295,18 @@ int main(int argc, char **argv) {
     return 0;
   }
 
-  for (int i = 0; i < dev_count; ++i) {
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
     gpu_list.emplace_back(i);
   }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Init(places);
+
   testing::InitGoogleTest(&argc, argv);
 
   // device context should be release before scope.
diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc
index 5ad1610fde041ee934486ef98ba41dca42559100..d39ca87d53518963f652f7b8c8cb289a6fef70fd 100644
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
@@ -73,7 +73,7 @@ class NCEOp : public framework::OperatorWithKernel {
 
 class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
     AddInput(
@@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h
index 6636dad06037f163252dc342200a99c756ed2a2e..e6b496f7896dcb412be8ff096fdccb2f0b682369 100644
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
index 8935751f15ccc4861c9e06d8d9031c8dff1a4af3..85d0153b32c0ba53bfe0912fc2682c8b635ba172 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase {
    * will be used.
    */
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
     for (auto& op : ops_) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
     }
   }
 
diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc
index 22fba9568d018586b4622884b7d6145fd646adb0..dfd86546e83a6276aedd198eaeb6fad2c50944df 100644
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope, const platform::Place& place) const override {
     ++run_cnt;
   }
 };
diff --git a/paddle/operators/batch_norm_op.md b/paddle/operators/op_documentation/batch_norm_op.md
similarity index 100%
rename from paddle/operators/batch_norm_op.md
rename to paddle/operators/op_documentation/batch_norm_op.md
diff --git a/paddle/operators/name_convention.md b/paddle/operators/op_documentation/name_convention.md
similarity index 96%
rename from paddle/operators/name_convention.md
rename to paddle/operators/op_documentation/name_convention.md
index b5cb176e003b4584321142ac9f1c3380b7010936..a02b356f058da68442516c2705d0bac140f8ef18 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/op_documentation/name_convention.md
@@ -35,8 +35,8 @@ Here we give some examples to show how these rules will be used.
 ```c++
 class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AccumulateOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
     If the output size is not the same as input size,
diff --git a/paddle/operators/net_op_design.md b/paddle/operators/op_documentation/net_op_design.md
similarity index 100%
rename from paddle/operators/net_op_design.md
rename to paddle/operators/op_documentation/net_op_design.md
diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/operators/op_documentation/op_markdown_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ee804d592252c727622cbe59b0644813db3c4fd
--- /dev/null
+++ b/paddle/operators/op_documentation/op_markdown_format.md
@@ -0,0 +1,64 @@
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+# PaddlePaddle Operator Name
+This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+# Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+# Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+# LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
+
+# The signature
+This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param_learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
diff --git a/paddle/operators/rnn_design.md b/paddle/operators/op_documentation/rnn_design.md
similarity index 100%
rename from paddle/operators/rnn_design.md
rename to paddle/operators/op_documentation/rnn_design.md
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 936dde22c34a30c5a50e2ac8a76f0f91dfb328ab..90c53bd17732aa046beceb3ac0a3b8c0d69994f3 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/pad_op.h"
 
@@ -48,7 +48,7 @@ class PadOp : public framework::OperatorWithKernel {
 
 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PadOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input of pad op. "
@@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* bind = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* bind = new framework::OpDesc();
     bind->SetInput("X", Input("X"));
     bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     bind->SetAttrMap(Attrs());
     bind->SetType("pad_grad");
-    return std::unique_ptr<framework::OpDescBind>(bind);
+    return std::unique_ptr<framework::OpDesc>(bind);
   }
 };
 
diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu
index c309fb625cca203418db2599a59ea0144782efc2..433b5f1112a27b36edbe6d99fcdd4fc8395bc2e8 100644
--- a/paddle/operators/pad_op.cu
+++ b/paddle/operators/pad_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/pad_op.h"
diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h
index 1b95942af3b3711fcad965cdc3f2d2f99b2f32e8..fdf91a5776620485c38a8b2c5f8b26039e438d0c 100644
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc
index fc2b37bd0fbac82005e709779b2939843b839596..2d0001ba1184c99d9fc642f60c97ba89cec97ccd 100644
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
 
     const Tensor *input = ctx.Input<Tensor>("X");
     Tensor *output = ctx.Output<Tensor>("Out");
@@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
 
     const Tensor *input = ctx.Input<Tensor>("X");
     const Tensor *output = ctx.Input<Tensor>("Out");
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index 45fa20280c1ad20f63d6542d5199e002ff60495f..50057eb6483e9c9e745bc07dee26a0bbbbb5a48c 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -67,8 +67,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
   ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
 }
 
-Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "X",
@@ -136,8 +135,7 @@ Example:
 )DOC");
 }
 
-Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput("X",
            "(Tensor) The input tensor of pooling operator. "
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index ab85d587a3131237d7a9ec774a11193c70220c7c..3860e295f4b4dbeb2d60cfb304847de39083f1e1 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -40,14 +40,12 @@ class PoolOpGrad : public framework::OperatorWithKernel {
 
 class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Pool2dOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };
 
 class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Pool3dOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index 1a2383f8b80357d2927c3b6a8c57c787ba7e366d..76c5123527c3ff5b7e6c7eec39f4eb1d612759d3 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
 
 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxPool2dWithIndexOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
@@ -178,8 +177,7 @@ Example:
 
 class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor) The input tensor of pooling operator. "
diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc
index 4ba40a62ec5f696ad980c2913f7e162879a557e2..a6b23c995b8b9104f2da9d3d29ceb3eb88e7da63 100644
--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
@@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
 
 class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PositiveNegativePairOpMaker(framework::OpProto *proto,
-                              framework::OpAttrChecker *op_checker)
+  PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Score",
              "(Tensor, float) Model Score on an item (with "
@@ -155,13 +154,14 @@ class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
         "Noting that reducing on the first dim will make the LoD info lost.")
         .SetDefault(0);
     AddComment(R"DOC(
-        PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) 
-        model performance. 
-        Within some context, e.g. the "query", a LTR model generates scores
-        for a list of items, which gives a partial order of the items.
-        PositiveNegativePairOp takes a list of reference rank order 
-        (Input("Label")) and the model generated scores (Input(Score)) as 
-        inputs and counts the pairs that ranked correctly and incorrectly.
+PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's
+performance.
+
+Within some context, e.g. the "query", a LTR model generates scores for a list
+of items, which gives a partial order of the items. PositiveNegativePairOp
+takes a list of reference rank order (Input("Label")) and the model generated
+scores (Input(Score)) as inputs and counts the pairs that ranked correctly
+and incorrectly.
 )DOC");
   }
 };
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
index 1ace4f2a5935dcb4239526c42599a42d288ff552..c5753147effd17c012683e1058e34af46288f366 100644
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
@@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
 
 class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PrecisionRecallOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("MaxProbs",
              "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index 317a2a40154f92f2e13a3012d2f7a63df9a69afb..ddc21a657024dcc800726475fa6242f8e6576ad1 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/prelu_op.h"
 #include "paddle/operators/net_op.h"
@@ -38,7 +38,7 @@ class PReluOp : public framework::OperatorWithKernel {
 
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of prelu operator.");
     AddInput("Alpha", "The alpha weight of prelu operator.");
diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu
index 12033dee0e1c190b08080023d6746fcad48db2fd..1718bb5cd65f48eba391023e4374a30e405a164d 100644
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/operators/prelu_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/prelu_op.h"
 
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc
index cc350f6d26e6d8bd6e59f2fda74a3b734df55247..b92f46b5bd4e48a25f8c87873c5df53f1753b71b 100644
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -59,8 +59,7 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {
 
 class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ProximalAdagradOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
index 0b26beb3ac3803c78f45cc2ce0a8f444bdc313b6..2d3bbdaf320a4d6bdf18ec92230a81ad98371498 100644
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -47,8 +47,7 @@ class ProximalGDOp : public framework::OperatorWithKernel {
 
 class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ProximalGDOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index b80b175792f3fc56d689c187b7182198542d7345..f2164a0f80519ed9c2490ab3aa6809dc84c6070d 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/rank_loss_op.h"
 
@@ -45,8 +45,7 @@ class RankLossOp : public framework::OperatorWithKernel {
 
 class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RankLossOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Label",
              "(2-D Tensor with shape [batch_size x 1]) "
diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu
index 5aee66443d60c8e20625880ba2ec9606b8a007a0..294b22738347b17dd67df05291ac496bfb608323 100644
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/rank_loss_op.h"
 
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index ea24b61fd94b57950e79b7c1ddb13fa165953538..bd0c49ca6e42bcb1a25c53421c0e672cecbb3a15 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 232d926f7b975c3b8ebecad983d0f1cc54b9486f..056fa46949cd623845956521b068109085a8795e 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <vector>
 #include "paddle/framework/executor.h"
@@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase {
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
     VLOG(3) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
-    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    framework::Executor executor(place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
     auto *program = block->Program();
 
     for (size_t i = 0; i < seq_len; ++i) {
@@ -270,6 +271,11 @@ class RecurrentOp : public RecurrentBase {
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/);
 
+      // get device context from pool
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+
       // Copy inside::output -> outside::output
       //    outside::output[seq_offset: seq_offset + 1] = inside::output
       this->LinkTensorWithCallback(
@@ -278,14 +284,13 @@ class RecurrentOp : public RecurrentBase {
               framework::LoDTensor *dst_tensor) {
             if (i == 0) {  // create output tensor at begin
               dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
-              dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type());
+              dst_tensor->mutable_data(place, src_tensor.type());
             }
 
             auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
             // Explicit copy output since the local RNN scope can be destroyed
             // early.
-            framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx,
-                                &dst_out);
+            framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out);
           });
 
       scopes.Next();
@@ -311,15 +316,20 @@ class RecurrentGradOp : public RecurrentBase {
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
-    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    framework::Executor executor(place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
     auto *program = block->Program();
 
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     for (size_t step_id = 0; step_id < seq_len; ++step_id) {
       size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
       VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
@@ -366,8 +376,7 @@ class RecurrentGradOp : public RecurrentBase {
           auto *cur_grad_var = cur_scope.Var(cur_grad);
           auto cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
-          framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx,
-                              cur_grad_tensor);
+          framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor);
         }
       }
 
@@ -410,7 +419,7 @@ class RecurrentGradOp : public RecurrentBase {
             auto zero_op = framework::OpRegistry::CreateOp(
                 "fill_constant", framework::VariableNameMap{},
                 {{"Out", {pg_names[param_id]}}}, attrs);
-            zero_op->Run(scope, dev_ctx);
+            zero_op->Run(scope, place);
           }
 
           auto new_inside_name = cur_scope.Rename(inside_grad_name);
@@ -419,7 +428,7 @@ class RecurrentGradOp : public RecurrentBase {
           auto sum_op = framework::OpRegistry::CreateOp(
               "sum", {{"X", {pg_names[param_id], new_inside_name}}},
               {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
-          sum_op->Run(cur_scope, dev_ctx);
+          sum_op->Run(cur_scope, place);
 
           cur_scope.Rename(new_inside_name, inside_grad_name);
         }
@@ -437,11 +446,11 @@ class RecurrentGradOp : public RecurrentBase {
             }
             if (step_id == 0) {  // alloc memory
               outside->Resize(PrependDims(seq_len, inside.dims()));
-              outside->mutable_data(dev_ctx.GetPlace(), inside.type());
+              outside->mutable_data(place, inside.type());
             }
 
             auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst);
+            framework::CopyFrom(inside, place, dev_ctx, &dst);
           });
       VLOG(5) << "Link outside gradient finished ";
 
@@ -453,8 +462,8 @@ class RecurrentGradOp : public RecurrentBase {
             [&](const framework::LoDTensor &inside,
                 framework::LoDTensor *outside) {
               outside->Resize(inside.dims());
-              outside->mutable_data(dev_ctx.GetPlace(), inside.type());
-              framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside);
+              outside->mutable_data(place, inside.type());
+              framework::CopyFrom(inside, place, dev_ctx, outside);
             });
         VLOG(5) << "Link initialize state gradient finished ";
       }
@@ -483,7 +492,7 @@ class RecurrentGradOp : public RecurrentBase {
 
   std::unordered_set<std::string> LocalVarNames(
       const framework::Scope &scope) const {
-    return this->List2Set(scope.GetAllNames(false));
+    return this->List2Set(scope.LocalVarNames());
   }
   static std::vector<std::string> GradVarLists(
       const std::vector<std::string> &var_names) {
@@ -497,8 +506,7 @@ class RecurrentGradOp : public RecurrentBase {
 
 class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecurrentOpProtoMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(kInputs, "rnn inputs").AsDuplicable();
     AddInput(kInitialStates, "rnn initial states").AsDuplicable();
@@ -523,8 +531,7 @@ The ex-state means the state value in the ex-timestep or the previous time step
         string::Sprintf(
             "The state variable names. [%s, %s, %s] must be the same order",
             kExStates, kStates, kInitStateGrads));
-    AddAttr<framework::BlockDescBind *>(kStepBlock,
-                                        "The step block inside RNN");
+    AddAttr<framework::BlockDesc *>(kStepBlock, "The step block inside RNN");
     AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not.
 By default reverse=False
 
@@ -566,13 +573,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  virtual std::unique_ptr<framework::OpDescBind> Apply() const {
-    auto *grad = new framework::OpDescBind();
+  virtual std::unique_ptr<framework::OpDesc> Apply() const {
+    auto *grad = new framework::OpDesc();
     grad->SetType("recurrent_grad");
     for (auto &input_param : this->InputNames()) {
       grad->SetInput(input_param, this->Input(input_param));
       grad->SetOutput(framework::GradVarName(input_param),
-                      this->InputGrad(input_param));
+                      this->InputGrad(input_param, false));
     }
 
     for (auto &output_param : this->OutputNames()) {
@@ -589,7 +596,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
     grad->SetAttrMap(this->Attrs());
     grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
 
-    return std::unique_ptr<framework::OpDescBind>(grad);
+    return std::unique_ptr<framework::OpDesc>(grad);
   }
 };
 
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index eed482c1b458cd442ede523838b400d85c23a155..322f8571cfd4341f064e8f9df512a8d74b91ed9d 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <stdint.h>
 #include <sys/stat.h>
@@ -24,9 +24,12 @@
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/proto_desc.h"
 #include "paddle/operators/detail/send_recv_impl.h"
 #include "paddle/operators/detail/simple_block_queue.h"
 
+#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
+
 namespace paddle {
 namespace operators {
 
@@ -38,7 +41,7 @@ void RunServer(Server **rpc_server,
   builder.RegisterService(service.get());
   std::unique_ptr<Server> server(builder.BuildAndStart());
   *rpc_server = server.get();
-  LOG(INFO) << "Server listening on " << server_address << std::endl;
+  LOG(INFO) << "Server listening on " << server_address;
   server->Wait();
 }
 
@@ -56,34 +59,94 @@ class RecvOp : public framework::OperatorBase {
     }
   }
 
-  virtual ~RecvOp() {
+  void Stop() override {
+    detail::TensorWithName term_msg;
+    term_msg.first = LISTEN_TERMINATE_MESSAGE;
+    rpc_service_->Push(term_msg);
     rpc_server_->Shutdown();
     server_thread_->join();
   }
 
+  std::string GetGradVarNameForTrainer(const std::string &varname) const {
+    if (grads_counter_.find(varname) == grads_counter_.end()) {
+      grads_counter_[varname] = 0;
+    }
+    char ret[256];
+    snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(),
+             grads_counter_[varname]++);
+    return std::string(ret);
+  }
+
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
-    // blocking get one var from client.
-    const framework::LoDTensor &t = rpc_service_->Get();
+           const platform::Place &dev_place) const override {
+    // FIXME(typhoonzero): no new scopes for every run.
     framework::Scope &recv_scope = scope.NewScope();
-    // set graph input var
-    auto *var = recv_scope.Var(Input("RX"));
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    // FIXME(typhoonzero): do not copy
-    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
-
-    std::string program_str = Attr<std::string>("OptimizeProgram");
-    framework::ProgramDesc program_desc;
-    program_desc.ParseFromString(program_str);
-    framework::ProgramDescBind program(program_desc);
-    framework::Executor executor(dev_ctx);
-    // Run sub graph to get optimized tensor
-    executor.Run(program, &recv_scope, 0, /*global_block*/
-                 false /*create_local_scope*/);
-
-    auto *out_var = recv_scope.FindVar("Out");
-    // push back
-    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+    rpc_service_->SetScope(&recv_scope);
+    auto param_list = Attr<std::vector<std::string>>("ParamList");
+    auto grad_list = Attr<std::vector<std::string>>("GradList");
+    auto trainer_count = Attr<int>("Trainers");
+    size_t param_count = param_list.size();
+    rpc_service_->Reset();
+    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
+    bool exit_flag = false;
+    while (!exit_flag) {
+      // Get from multiple trainers, we don't care about order in which
+      // the gradient arrives, just add suffix 0~n then average the gradient.
+      for (size_t i = 0; i < param_count * trainer_count; ++i) {
+        // blocking get one var from client.
+        const detail::TensorWithName &v = rpc_service_->Get();
+        auto grad_var_name = v.first;
+        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
+          exit_flag = true;
+          break;
+        }
+        auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+        std::string param_var_name;
+        if (it != grad_list.end()) {
+          param_var_name = param_list[it - grad_list.begin()];
+        } else {
+          LOG(ERROR) << "grad have no paired param found!";
+        }
+        VLOG(3) << "recved grad: " << grad_var_name
+                << " updating param: " << param_var_name;
+        auto *merged_grad = recv_scope.FindVar(grad_var_name);
+        if (merged_grad == nullptr) {
+          // create output of merged var.
+          auto merged_var = recv_scope.Var(grad_var_name);
+          merged_var->GetMutable<framework::LoDTensor>();
+        }
+
+        if (trainer_count > 1) {
+          grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
+        }
+
+        auto *var = recv_scope.Var(grad_var_name);
+        auto *tensor = var->GetMutable<framework::LoDTensor>();
+        // FIXME(typhoonzero): do not copy
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(dev_place);
+        framework::CopyFrom(v.second, dev_place, dev_ctx, tensor);
+      }
+      if (exit_flag) {
+        break;
+      }
+      rpc_service_->Reset();
+
+      std::string program_str = Attr<std::string>("OptimizeProgram");
+      framework::proto::ProgramDesc program_desc;
+      program_desc.ParseFromString(program_str);
+      framework::ProgramDesc program(program_desc);
+      framework::Executor executor(dev_place);
+      // Run sub graph to get optimized tensor
+      try {
+        executor.Run(program, &recv_scope, 0, /*global_block*/
+                     false /*create_local_scope*/, false /*create_vars*/);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+      rpc_service_->Done();
+      grads_counter_.clear();
+    }  // while(true)
   }
 
  protected:
@@ -93,13 +156,14 @@ class RecvOp : public framework::OperatorBase {
   // grpc send/recv service implement to register.
   std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
   std::shared_ptr<std::thread> server_thread_;
+  mutable std::unordered_map<std::string, int> grads_counter_;
 };
 
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("RX", "(Tensor) Input tensor to be saved");
+    AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
 
@@ -112,6 +176,17 @@ This operator will recv tensor from send_op
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
     AddAttr<std::string>("OptimizeProgram", "type string",
                          "Serialized ProgramDesc string for recv to run.");
+    AddAttr<std::vector<std::string>>(
+        "ParamList", "type list of string",
+        "grad->param name mapping to find which param to optimize.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "GradList", "type list of string",
+        "grad->param name mapping to find which param to optimize.")
+        .SetDefault({});
+    AddAttr<int>("Trainers", "type int",
+                 "Number of trainers in the current cluster job")
+        .SetDefault(1);
   }
 };
 
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index fedc2a5c37ff84ffdf8ebd2f19296db92e256e5b..a3ff4a6ca0ef30be42e7801386a3561930638a8a 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/reduce_op.h"
 #include "paddle/operators/net_op.h"
@@ -83,7 +83,7 @@ class ReduceGradOp : public framework::OperatorWithKernel {
 
 class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor) The input tensor. Tensors with rank at most 6 are "
@@ -135,8 +135,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
 
 class ReduceSumOpMaker : public ReduceOpMaker {
  public:
-  ReduceSumOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : ReduceOpMaker(proto, op_checker) {
     SetComment("ReduceSum", "sum");
     AddComment(comment_);
@@ -145,8 +144,7 @@ class ReduceSumOpMaker : public ReduceOpMaker {
 
 class ReduceMeanOpMaker : public ReduceOpMaker {
  public:
-  ReduceMeanOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : ReduceOpMaker(proto, op_checker) {
     SetComment("ReduceMean", "mean");
     AddComment(comment_);
@@ -155,8 +153,7 @@ class ReduceMeanOpMaker : public ReduceOpMaker {
 
 class ReduceMaxOpMaker : public ReduceOpMaker {
  public:
-  ReduceMaxOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : ReduceOpMaker(proto, op_checker) {
     SetComment("ReduceMax", "max");
     AddComment(comment_);
@@ -165,8 +162,7 @@ class ReduceMaxOpMaker : public ReduceOpMaker {
 
 class ReduceMinOpMaker : public ReduceOpMaker {
  public:
-  ReduceMinOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : ReduceOpMaker(proto, op_checker) {
     SetComment("ReduceMin", "min");
     AddComment(comment_);
diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu
index a10ace5253b850db5855bef8384278edebc9e45f..1dd948ed8a79cce8468f2fe210b5636e7dd1f99e 100644
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/reduce_op.h"
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
index 7bd99cb1e6d532963ef648202f460f363baad9b5..da5f3977769990a45c94db21f5dbd01ac70ac06e 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d652ff806461cea3d0e8d3bd70704b4b6bc2173
--- /dev/null
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class ReorderLoDTensorByRankTableOpProtoMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto,
+                                          OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) the input lod tensor need to be reordered.");
+    AddInput("RankTable",
+             "(LoDRankTable) the rank table that input need follow");
+    AddOutput("Out", "(LoDTensor) reordered lod tensor");
+    AddComment(R"DOC(ReorderLoDTensorByRankTable
+
+Reorder the input X by the rank of `RankTable`. If `RankTable` is ordered by
+index [3, 0, 2, 1]. Input X will reorder its sequence, the third sequence of
+X will be the first sequence of Output.
+
+NOTE: The RankTable does not need to be calculated by X.
+
+For example:
+The X = [Seq0, Seq1, Seq2, Seq3]. The indices of RankTable are [3, 0, 2, 1].
+
+The Out =  [Seq3, Seq0, Seq2, Seq1] with correct LoD information.
+)DOC");
+  }
+};
+
+class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
+ public:
+  ReorderLoDTensorByRankTableBase(const std::string &type,
+                                  const framework::VariableNameMap &inputs,
+                                  const framework::VariableNameMap &outputs,
+                                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto &x =
+        detail::Ref(scope.FindVar(Input("X")),
+                    "Cannot find input lod tensor variable %s", Input("X"))
+            .Get<framework::LoDTensor>();
+    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")),
+                                   "Cannot find input rank table variable %s",
+                                   Input("RankTable"))
+                           .Get<framework::LoDRankTable>();
+    auto &out =
+        *detail::Ref(scope.FindVar(Output("Out")),
+                     "Cannot find output lod tensor variable %s", Output("Out"))
+             .GetMutable<framework::LoDTensor>();
+
+    out.Resize(x.dims());
+    out.mutable_data(x.place(), x.type());
+    this->process(place, x, rank_table, &out);
+  }
+
+ protected:
+  virtual void process(const platform::Place &place,
+                       const framework::LoDTensor &x,
+                       const framework::LoDRankTable &rank_table,
+                       framework::LoDTensor *out) const = 0;
+
+  struct AbsoluteRankTableItem {
+    size_t offset;  // the absolute/accumulated offset.
+    size_t length;  // the length
+    framework::LoD lod;
+  };
+
+  std::vector<AbsoluteRankTableItem> GetAbsoluteOffsetAndLengthByLoDRankTable(
+      const framework::LoDTensor &x) const {
+    std::vector<AbsoluteRankTableItem> absolute_table;
+    size_t level = 0;
+    size_t size = x.lod()[level].size();
+
+    for (size_t i = 0; i < size - 1; ++i) {
+      auto lod_offset =
+          framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level);
+
+      auto &offset = lod_offset.second;
+
+      absolute_table.emplace_back();
+      absolute_table.back().length = offset.second - offset.first;
+      absolute_table.back().offset = offset.first;
+      absolute_table.back().lod = lod_offset.first;
+    }
+    return absolute_table;
+  }
+
+  size_t CopyTensorAndLod(const platform::Place &place,
+                          const AbsoluteRankTableItem &item,
+                          const framework::LoDTensor &x,
+                          framework::LoDTensor *out, size_t out_offset) const {
+    auto &out_lod = *out->mutable_lod();
+    auto len = item.length;
+    auto x_offset = item.offset;
+
+    if (out_lod.empty()) {
+      for (size_t i = 0; i < item.lod.size(); ++i) {
+        out_lod.push_back(std::vector<size_t>({0}));
+      }
+    }
+
+    for (size_t i = 0; i < out_lod.size(); ++i) {
+      auto &out_v = out_lod[i];
+      auto &new_lod_v = item.lod[i];
+
+      for (auto &detail : new_lod_v) {
+        out_v.push_back(out_v.back() + detail);
+      }
+    }
+
+    auto x_sliced = x.Slice(x_offset, x_offset + len);
+    auto out_sliced = out->Slice(out_offset, out_offset + len);
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
+    out_offset += len;
+    return out_offset;
+  }
+};
+
+class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
+ public:
+  ReorderLoDTensorByRankTableOp(const std::string &type,
+                                const framework::VariableNameMap &inputs,
+                                const framework::VariableNameMap &outputs,
+                                const framework::AttributeMap &attrs)
+      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  void process(const platform::Place &place, const framework::LoDTensor &x,
+               const framework::LoDRankTable &rank_table,
+               framework::LoDTensor *out) const override {
+    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
+    size_t out_offset = 0;
+    out->mutable_lod()->clear();
+    for (auto &item : rank_table.items()) {
+      PADDLE_ENFORCE_LT(item.index, absolute_table.size());
+      out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out,
+                                    out_offset);
+    }
+  }
+};
+
+class IdentityInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ReorderLodTensorByRankGradOpMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("reorder_lod_tensor_by_rank_grad");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
+ public:
+  ReorderLoDTensorByRankGradOp(const std::string &type,
+                               const framework::VariableNameMap &inputs,
+                               const framework::VariableNameMap &outputs,
+                               const framework::AttributeMap &attrs)
+      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  void process(const platform::Place &place, const framework::LoDTensor &x,
+               const framework::LoDRankTable &rank_table,
+               framework::LoDTensor *out) const override {
+    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
+
+    // offsets = enumerate([item.index for item in rank_table.items()])
+    std::vector<std::pair<size_t, size_t>> offsets;
+    offsets.reserve(rank_table.items().size());
+    for (size_t i = 0; i < rank_table.items().size(); ++i) {
+      offsets.push_back({i, rank_table.items()[i].index});
+    }
+
+    // offsets.sort(key=lambda x: x[1])
+    std::sort(
+        offsets.begin(), offsets.end(),
+        [](const std::pair<size_t, size_t> &a,
+           const std::pair<size_t, size_t> &b) { return a.second < b.second; });
+
+    // Copy TensorAndLod
+    size_t out_offset = 0;
+    for (auto &offset : offsets) {
+      out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first],
+                                          x, out, out_offset);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(reorder_lod_tensor_by_rank,
+                  ops::ReorderLoDTensorByRankTableOp,
+                  ops::ReorderLodTensorByRankGradOpMaker,
+                  ops::ReorderLoDTensorByRankTableOpProtoMaker,
+                  ops::IdentityInferShape);
+REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad,
+                  ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape);
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index 7fd33bf662a1d0b7b6fa4e772bdadbf34b2f4fdd..58e8fd6124d8c076337ae9bb2f5103e7a3cb7ff0 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/reshape_op.h"
 
@@ -34,21 +34,33 @@ class ReshapeOp : public framework::OperatorWithKernel {
     auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
     PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
     auto x_dims = ctx->GetInputDim("X");
-    // TODO(qiao) change batch_size
-    for (size_t i = 1; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 0,
-                     "Each dimension of Attr(shape) "
-                     "must be positive except the first one.");
-    }
-    if (shape[0] < 0) {
-      shape[0] = x_dims[0];
+
+    std::vector<size_t> neg_dims_idx;
+    // set some dimension to -1 if it is unknown
+    const int unknown_size = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
+                     "Each dimension of Attr(shape) must be positive or %d.",
+                     unknown_size);
+      if (shape[i] == unknown_size) {
+        neg_dims_idx.push_back(i);
+        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
+                       "Only one dimension of Attr(shape) can be unknown.");
+      }
     }
-    // capacity check
+
     int64_t capacity =
         std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
     int64_t in_size = framework::product(x_dims);
-    PADDLE_ENFORCE_EQ(capacity, in_size,
-                      "The size of Input(X) mismatches with Attr(shape).");
+    if (neg_dims_idx.size() == 1) {
+      // dim infer
+      shape[neg_dims_idx[0]] = in_size / (-capacity);
+      // recalculate capacity
+      capacity = shape[neg_dims_idx[0]] * (-capacity);
+    }
+    // capacity check
+    PADDLE_ENFORCE(capacity == in_size,
+                   "The size of Input(X) mismatches with Attr(shape).");
     // resize output
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
@@ -65,8 +77,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
 
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReshapeOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of reshape operator.");
     AddOutput("Out", "The output tensor of reshape operator.");
@@ -88,6 +99,9 @@ the tensor X into a 2-D tensor:
 
     [[1, 2, 3, 4]]
 
+One dimension in the target shape can be set -1, representing that its
+size is unknown. In this case, the real dimension will be infered from 
+the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
   }
 };
diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu
index b7329238c0ea8ebb374d35bd7cddced3dfee1a2c..f487e43b99d5be2af299a9edd91dcda0c4eb7b99 100644
--- a/paddle/operators/reshape_op.cu
+++ b/paddle/operators/reshape_op.cu
@@ -1,22 +1,22 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/reshape_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
     reshape,
-    paddle::operators::ReshapeKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
 REGISTER_OP_CUDA_KERNEL(
     reshape_grad,
-    paddle::operators::ReshapeGradKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index 92d8cbbb56e224fe67e630bdfcb16d7df44f2846..a4eb34a0ad1230b6257cd299c8ed563acb054367 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
index fc3f9b8988ec7fe0093ef6b09a105747b0025ec1..f7c250bf913b9213e7d7e2cca9ecadf74cac91a1 100644
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel {
 
 class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RmspropOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu
index 2a9fd6e1044e923b9ccffab834ff64df0f7cf5d7..0295dc262f095a2b58ab34e3c2ec9f5440e4bfca 100644
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/rmsprop_op.h"
diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc
index 3a035f0b9acb94bab60659938e11b4996b8eaa0f..eb55ed6a05b51d7a6c63d16fcf5aff73f6744903 100644
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
@@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                     const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto mem_var_name = Input("X");
     auto *mem_var = scope.FindVar(mem_var_name);
     PADDLE_ENFORCE(mem_var != nullptr,
@@ -57,15 +57,14 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
 
 class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RNNMemoryHelperOpInfoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "");
     AddOutput("Out", "");
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
     AddComment("");
   }
 };
@@ -78,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto out_grad_var_name = Input(framework::GradVarName("Out"));
     auto *out_grad_var = scope.FindVar(out_grad_var_name);
 
@@ -101,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 
       auto zero_op = framework::OpRegistry::CreateOp(
           "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs);
-      zero_op->Run(scope, dev_ctx);
+      zero_op->Run(scope, dev_place);
     } else {
       auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
       auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
@@ -114,8 +113,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 class RNNMemoryHelperGradOpInfoMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  RNNMemoryHelperGradOpInfoMaker(framework::OpProto *proto,
-                                 framework::OpAttrChecker *op_checker)
+  RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(framework::GradVarName("Out"), "");
     AddInput("X", "");
@@ -124,7 +122,7 @@ class RNNMemoryHelperGradOpInfoMaker
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
     AddComment("");
   }
 };
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
index 75fcea8401fbbc2943c0d6a50ca81288268823d8..ef1804d9762200686ac8537140af046c21443779 100644
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -99,8 +99,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
 
 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ROIPoolOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor), "
diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc
index 5203a5079c8b125f8dc156202f70ce76711a1e30..68f4e3531566fd346055404d45651c2b53ebe31b 100644
--- a/paddle/operators/row_conv_op.cc
+++ b/paddle/operators/row_conv_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/row_conv_op.h"
 #include "paddle/framework/eigen.h"
@@ -76,8 +76,7 @@ class RowConvGradOp : public framework::OperatorWithKernel {
 
 class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowConvOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(LoDTensor), the input(X) is a LodTensor, which supports "
diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu
index 56a98ff299e8263179306756631949761e386f70..41f2c5b9de91ade15b4010f56377675cfd1b611c 100644
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/row_conv_op.h"
diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h
index 80912ad8f73b3581efa9e263427e99304208d581..10d435ab080851713ee08a491c43aad1549f6fbb 100644
--- a/paddle/operators/row_conv_op.h
+++ b/paddle/operators/row_conv_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
index a57466a48d4d6016fe2618d19fdca4c4f667124a..40103d864fb58804b39ca5f3c63e802a430ce886 100644
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "gtest/gtest.h"
 #include "paddle/framework/op_registry.h"
@@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load);
 TEST(SaveLoadOp, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;
-  paddle::platform::CPUDeviceContext ctx(place);
+
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({10, 10});
@@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) {
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
       "save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, ctx);
+  save_op->Run(scope, place);
 
   auto load_var = scope.Var("out_var");
   auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
   auto load_op = paddle::framework::OpRegistry::CreateOp(
       "load", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, ctx);
+  load_op->Run(scope, place);
   int* actual = target->data<int>();
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     EXPECT_EQ(expect[i], actual[i]);
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index d4921cb80c8d78c52ae1887c36819b52621470eb..4b1cbe88836e340c94f797806243a6768410ed3d 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <stdint.h>
 #include <sys/stat.h>
@@ -21,6 +21,7 @@
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
@@ -88,14 +89,18 @@ class SaveOp : public framework::OperatorBase {
                    "SaveOp only support LoDTensor, %s has wrong type", iname);
 
     auto &tensor = var->Get<framework::LoDTensor>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     framework::SerializeToStream(fout, tensor, dev_ctx);
   }
 };
 
 class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SaveOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor ) Input tensor to be saved");
     AddComment(R"DOC(
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index d848be823e602e595f66138f4b5dfd6e38dd85a1..f634ebe9a2a4648bd08f00af635ef22e8d86a8de 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/scale_op.h"
 #include "paddle/operators/net_op.h"
@@ -38,7 +38,7 @@ class ScaleOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) Input tensor of scale operator.");
     AddOutput("Out", "(Tensor) Output tensor of scale operator.");
@@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("scale");
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttr("scale", GetAttr("scale"));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
index 0c7980430f31e2720c7af97aa14cf146c7dfc009..7202c0de707ff0b0b3ad966d9d1d3a7c0a89e880 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/scale_op.h"
 
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
index 02a8c97a83f5b6f95bbd4079c453dfdc7b7c1481..395268c2eee40c187f5d211317ca8b28d35a71e0 100644
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
index d95436be4f25b9df4aaef57ddb249ecf944f0666..55555300fc3219c0651583d8540b47189c8d3f13 100644
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/tensor.h"
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 573bbcd1875c86a2d843b6c5e9c1af4d48a5cb18..806dccc6ca78bf64da828fe13e08e043097bd939 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
@@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
@@ -78,8 +78,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
 
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScatterOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Ref", "The source input of scatter op");
     AddInput("Index",
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 6b43a1389f98bf268cb3b70d7e61409f361e0063..0c198d225890882ab6697d3a8b3d17e034c06cc4 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "gather.cu.h"
 #include "paddle/operators/gather_op.h"
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index a3059847f2d420359b347e3a5d514d8a3829a4e2..95c207221a7b34732eca4cfd07fed0a8f1671981 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <ostream>
 
@@ -34,45 +34,62 @@ class SendOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {
     // init client when the operator is created at runtime.
-    if (!client_) {
-      std::string endpoint = Attr<std::string>("endpoint");
-      client_.reset(new detail::RPCClient(
-          grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
-      // TODO(typhoonzero): how to call InitVariables
+    std::vector<std::string> endpoints =
+        Attr<std::vector<std::string>>("endpoints");
+    for (auto ep : endpoints) {
+      client_map_[ep].reset(new detail::RPCClient(
+          grpc::CreateChannel(ep, grpc::InsecureChannelCredentials())));
     }
   }
+
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
-    auto iname = Input("X");
-    auto oname = Output("Out");
-    // TODO(typhoonzero): currently it's non-blocking,
-    // should block until server responds.
-    bool ret = client_->SendVariable(scope, iname, oname);
-    if (!ret) {
-      LOG(ERROR) << "send variable error";
+           const platform::Place &dev_place) const override {
+    auto ins = Inputs("X");
+    auto outs = Outputs("Out");
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    // TODO(typhoonzero): use async calls to send multiple variable asyncly.
+    for (size_t i = 0; i < ins.size(); ++i) {
+      bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]);
+      if (!ret) {
+        LOG(ERROR) << "send variable error: " << ins[i];
+      }
+    }
+    // TODO(typhoonzero): support async optimization
+    client_map_[epmap[0]]->Wait();
+    for (size_t i = 0; i < outs.size(); ++i) {
+      bool ret = client_map_[epmap[i]]->GetVariable(scope, outs[i]);
+      if (!ret) {
+        LOG(ERROR) << "GetVariable error: " << outs[i];
+      }
     }
   }
 
  protected:
-  std::shared_ptr<detail::RPCClient> client_{nullptr};
+  mutable std::unordered_map<std::string, std::shared_ptr<detail::RPCClient>>
+      client_map_;
 };
 
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) Input tensor to be saved");
-    AddOutput("Out", "(Tensor) Output fetched from server");
+    AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable();
+    AddOutput("Out", "(Tensor) Output tensor to get from server")
+        .AsDuplicable();
     AddComment(R"DOC(
 Recv operator
 
-This operator will recv tensor from send_op
+This operator will send tensor to recv_op.
 )DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
-        .SetDefault("127.0.0.1:6164")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints in the order of input "
+                                      "variables for mapping")
+        .SetDefault({});
   }
 };
 
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
index 3e2e2051afacb748877e3b0c3dec8d6662ac4e72..108e2dec6b3eecadd431fd25f9a31ec17a874b6b 100644
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -1,27 +1,26 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-// TODO(typhoonzero): add python bindings for this test as
-// a RemoteOptimizer.
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <unistd.h>
+#include <string>
 #include <thread>
 
 #include "gtest/gtest.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
+#include "paddle/string/printf.h"
 
 USE_NO_KERNEL_OP(send);
 USE_NO_KERNEL_OP(recv);
@@ -33,30 +32,33 @@ std::unique_ptr<paddle::framework::OperatorBase> recv_op;
 void InitTensorsInScope(paddle::framework::Scope &scope,
                         paddle::platform::CPUPlace &place) {
   paddle::platform::CPUDeviceContext ctx(place);
-  auto var = scope.Var("X");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({10, 10});
-  float *expect = tensor->mutable_data<float>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<float>(i);
+  for (int i = 0; i < 2; ++i) {
+    auto var_name = paddle::string::Sprintf("x%d", i);
+    auto var = scope.Var(var_name);
+    auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+    tensor->Resize({10, 10});
+    float *expect = tensor->mutable_data<float>(place);
+    for (int64_t i = 0; i < tensor->numel(); ++i) {
+      expect[i] = static_cast<float>(i);
+    }
   }
 
   auto out_var = scope.Var("Out");
   auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
   out_tensor->Resize({10, 10});
-  tensor->mutable_data<float>(place);  // allocate
+  out_tensor->mutable_data<float>(place);  // allocate
 }
 
 void AddOp(const std::string &type,
            const paddle::framework::VariableNameMap &inputs,
            const paddle::framework::VariableNameMap &outputs,
            paddle::framework::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
+           paddle::framework::BlockDesc *block) {
   // insert output
   for (auto kv : outputs) {
     for (auto v : kv.second) {
       auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
+      var->SetDataType(paddle::framework::proto::DataType::FP32);
     }
   }
 
@@ -78,21 +80,22 @@ void StartServerNet() {
   InitTensorsInScope(scope, place);
 
   // sub program run in recv_op, for simple test we use sum
-  paddle::framework::ProgramDescBind program;
-  paddle::framework::BlockDescBind *block = program.MutableBlock(0);
+  paddle::framework::ProgramDesc program;
+  paddle::framework::BlockDesc *block = program.MutableBlock(0);
   // X for server side tensors, RX for received tensers, must be of same shape.
-  AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"x0"}}}, {}, block);
 
   paddle::framework::AttributeMap attrs;
   attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"ParamList", std::vector<std::string>({"x0"})});
+  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   std::string program_proto;
   PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));
 
   attrs.insert({"OptimizeProgram", program_proto});
-  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
-                                                    {{"Out", {"Out"}}}, attrs);
-  paddle::platform::CPUDeviceContext ctx(place);
-  recv_op->Run(scope, ctx);
+  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}},
+                                                    {}, attrs);
+  recv_op->Run(scope, place);
 }
 
 TEST(SendRecvOp, CPU) {
@@ -104,25 +107,25 @@ TEST(SendRecvOp, CPU) {
   InitTensorsInScope(scope, place);
 
   paddle::framework::AttributeMap attrs;
-  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
-
+  attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
+  attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
   auto send_op = paddle::framework::OpRegistry::CreateOp(
-      "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
-  paddle::platform::CPUDeviceContext ctx(place);
-  send_op->Run(scope, ctx);
+      "send", {{"X", {"x1"}}}, {{"Out", {"x0"}}}, attrs);
+  send_op->Run(scope, place);
 
-  auto in_var = scope.Var("X");
+  auto in_var = scope.Var("x1");
   auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
   float *expected = tensor->data<float>();
-
-  auto out_var = scope.Var("Out");
+  auto out_var = scope.Var("x0");
   auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
-  // send fail cause output is none.
+  // x1 * 2 == x0
   EXPECT_NE(target->memory_size(), size_t(0));
   float *actual = target->data<float>();
   for (int64_t i = 0; i < target->numel(); ++i) {
     EXPECT_EQ(expected[i] * 2, actual[i]);
   }
-  recv_op.reset();  // dtor can shutdown and join server thread.
+
+  recv_op->Stop();
   server_thread.join();
+  // recv_op.reset();
 }
diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/seq_expand_op.cu
deleted file mode 100644
index 8e67ce9ccb29497a957508a9ecdc6b810a7de543..0000000000000000000000000000000000000000
--- a/paddle/operators/seq_expand_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/operators/seq_expand_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    seq_expand,
-    ops::SeqExpandKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    seq_expand_grad,
-    ops::SeqExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index 9c7e5456e8238af70f920aaaa9cc652d5d12d3e9..2f0aad2003e48952ca26ca27573bc45386a4e585 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel {
 
 class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceConcatOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(LodTensorArray) Input is a vector of LoDTensor, "
@@ -68,12 +67,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The level should be less than the level number of inputs.")
         .SetDefault(0);
     AddComment(R"DOC(
-The sequence_concat operator concatenates multiple LoDTensors. 
-It only supports sequence (LoD Tensor with level number is 1) 
+The sequence_concat operator concatenates multiple LoDTensors.
+It only supports sequence (LoD Tensor with level number is 1)
 or a nested sequence (LoD tensor with level number is 2) as its input.
 - Case1:
   If the axis is other than 0(here, axis is 1 and level is 1),
-  each input should have the same LoD information and the LoD 
+  each input should have the same LoD information and the LoD
   information of the output keeps the same as the input.
 
   LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
@@ -81,7 +80,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input.
   LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
 
 - Case2:
-  If the axis is 0(here, leve is 0), the inputs are concatenated along 
+  If the axis is 0(here, leve is 0), the inputs are concatenated along
   time steps, the LoD information of the output need to re-compute.
   The LoD information of level-1 should be same.
 
@@ -125,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker,
-            sequence_concat_grad, ops::SequenceConcatGradOp);
+REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
+               ops::SequenceConcatOpMaker, sequence_concat_grad,
+               ops::SequenceConcatGradOp, false);
 REGISTER_OP_CPU_KERNEL(
     sequence_concat,
     ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
index f5c4f1c13331f45183d2810a95f773ad52aca13b..c5b7c81bd7c6e1110aa9e2ced629bea5d88832d1 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -100,8 +100,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {
 
 class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceConvOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc
index eacba79ace3e60a408d5f5e21a6fe2658da56ca7..0b8f2c695564f19cf71ecc56a60e707c3703af36 100644
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/sequence_conv_op.h"
 
diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/sequence_expand_op.cc
similarity index 72%
rename from paddle/operators/seq_expand_op.cc
rename to paddle/operators/sequence_expand_op.cc
index ede9754697429a4d24c51cf494b0ea8f4e408b44..b40ec617e42110e0ab5168a8ac675adaf760fb3c 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/sequence_expand_op.cc
@@ -1,25 +1,25 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
-#include "paddle/operators/seq_expand_op.h"
+#include "paddle/operators/sequence_expand_op.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
 
-class SeqExpandOp : public framework::OperatorWithKernel {
+class SequenceExpandOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -35,25 +35,24 @@ class SeqExpandOp : public framework::OperatorWithKernel {
   }
 };
 
-class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SeqExpandOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor or LoDTensor) The input(X) of this operator can be a "
              "LoDTensor or a base Tensor.");
     AddInput("Y",
-             "(LoDTensor)The reference input(Y) of seq_expand op."
+             "(LoDTensor)The reference input(Y) of sequence_expand op."
              "It must be a LoDTensor with k-level(k>0)."
              "The input(X) will be expanded according to LOD of input(Y)."
              "The element numbers of last level in input(Y) "
              "must be equal to dims[0] of input(X).");
     AddOutput("Out",
-              "(LodTensor)The output of seq_expand op."
+              "(LodTensor)The output of sequence_expand op."
               "The lod of output will be as same as input(Y)'s lod.");
     AddComment(R"DOC(
-Seq Expand Operator.
+Sequence Expand Operator.
 
 This operator expands input(X) according to LOD of input(Y).
 Following are cases to better explain how this works:
@@ -124,7 +123,7 @@ then we get 2-level LoDTensor
   }
 };
 
-class SeqExpandOpGrad : public framework::OperatorWithKernel {
+class SequenceExpandOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -146,11 +145,11 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker,
-            seq_expand_grad, ops::SeqExpandOpGrad);
+REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
+            sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    seq_expand,
-    ops::SeqExpandKernel<paddle::platform::CPUDeviceContext, float>);
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
-    seq_expand_grad,
-    ops::SeqExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sequence_expand_op.cu b/paddle/operators/sequence_expand_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0b9638b2ce60f73c95e1ccbcfb16cef7b5351073
--- /dev/null
+++ b/paddle/operators/sequence_expand_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/sequence_expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/sequence_expand_op.h
similarity index 83%
rename from paddle/operators/seq_expand_op.h
rename to paddle/operators/sequence_expand_op.h
index fbee0db454f9701e3f58a41008efd24e728d0600..2ba628e9c37278025e31779ab0468db46f2ff40a 100644
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/sequence_expand_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -24,7 +24,7 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 
 template <typename DeviceContext, typename T>
-class SeqExpandKernel : public framework::OpKernel<T> {
+class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<LoDTensor>("X");
@@ -71,7 +71,7 @@ class SeqExpandKernel : public framework::OpKernel<T> {
  *
  * */
 template <typename DeviceContext, typename T>
-class SeqExpandGradKernel : public framework::OpKernel<T> {
+class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 3526e45a1b6565bc21413d381d15c02f08c587bd..aea98744d8fc1fc59a07250d57f76f26fb9f3634 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -37,8 +37,7 @@ class SequencePoolOp : public framework::OperatorWithKernel {
 
 class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequencePoolOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
     AddOutput("Out",
@@ -50,7 +49,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddAttr<std::string>(
         "pooltype",
-        "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
+        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
         .SetDefault("AVERAGE")
         .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
     AddComment(R"DOC(
@@ -108,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu
index fcd65084353744dc836ff1dc5a3aa4b03a205130..265f695935236236f98c2dd2062072756e9c8b14 100644
--- a/paddle/operators/sequence_pool_op.cu
+++ b/paddle/operators/sequence_pool_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
index 481db8f9e548de68c102210035d4ff037ab56261..98bd8854903e5abf6d27432a2af0aaae980c0b1d 100644
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
 
 class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceSliceOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(LoDTensor), "
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
index 37d5452e6ba59411f9ab2e1460fc8584583f0321..b74766f012e333cc2a317e6efe17c5b60238924a 100644
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -33,8 +33,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
 
 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceSoftmaxOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker)
+  SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
@@ -51,10 +50,14 @@ input Tensor can be either [N, 1] or [N], where N is the sum of the length
 of all sequences.
 
 The algorithm works as follows:
+
     for i-th sequence in a mini-batch:
-        $$Out(X[lod[i]:lod[i+1]], :) =
-            \frac{\exp(X[lod[i]:lod[i+1], :])}
-            {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$
+
+$$
+Out(X[lod[i]:lod[i+1]], :) = \
+\frac{\exp(X[lod[i]:lod[i+1], :])} \
+{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+$$
 
 For example, for a mini-batch of 3 sequences with variable-length,
 each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 121bf60b27c62c1b0dd4c34c12962b7098e29ae2..a11c9624ce5e8485449dd6b420ad1f23ff3550c7 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -43,7 +43,7 @@ class SGDOp : public framework::OperatorWithKernel {
 
 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SGDOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("LearningRate", "(Tensor) Learning rate of SGD");
@@ -61,43 +61,9 @@ $$param\_out = param - learning\_rate * grad$$
   }
 };
 
-template <typename T>
-struct SparseSGDFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  const framework::Tensor& learning_rate,
-                  framework::Tensor* output) {
-    auto in_height = input.height();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
-
-    auto& in_value = input.value();
-    auto& in_rows = input.rows();
-
-    int64_t in_row_numel = in_value.numel() / in_rows.size();
-    PADDLE_ENFORCE_EQ(in_row_numel, output->numel() / in_height);
-
-    auto* in_data = in_value.data<T>();
-    auto* out_data = output->data<T>();
-    auto* lr = learning_rate.data<T>();
-
-    for (size_t i = 0; i < in_rows.size(); i++) {
-      for (int64_t j = 0; j < in_row_numel; j++) {
-        out_data[in_rows[i] * in_row_numel + j] -=
-            lr[0] * in_data[i * in_row_numel + j];
-      }
-    }
-  }
-};
-
-template struct SparseSGDFunctor<platform::CPUDeviceContext, float>;
-template struct SparseSGDFunctor<platform::CPUDeviceContext, double>;
-
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    sgd, ops::SGDOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SGDOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index a3c0db7e50ecaabd6d4b83c43e5436e6be491676..42f8f8b2f072f9d204dfadcd732926b5c98dc617 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
@@ -20,6 +20,19 @@ namespace paddle {
 namespace operators {
 
 namespace {
+
+template <typename T>
+__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
+                          const int num, T* p_out) {
+  T lr = learning_rate[0];
+  int grid_size = blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) {
+    T g_data = g[i];
+    T p_data = p[i];
+    p_out[i] = p_data - lr * g_data;
+  }
+}
+
 template <typename T, int block_size>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                        const int64_t* rows,
@@ -41,40 +54,65 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 }  // namespace
 
 template <typename T>
-struct SparseSGDFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& input,
-                  const framework::Tensor& learning_rate,
-                  framework::Tensor* output) {
-    auto in_height = input.height();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
-
-    auto& in_value = input.value();
-    auto& in_rows = input.rows();
-
-    int64_t in_row_numel = in_value.numel() / in_rows.size();
-    PADDLE_ENFORCE_EQ(in_row_numel, output->numel() / in_height);
-
-    auto* in_data = in_value.data<T>();
-    auto* out_data = output->data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(1, in_rows.size());
-    SparseSGDFunctorKernel<T, 256><<<grid, threads, 0, context.stream()>>>(
-        in_data, in_rows.data(), learning_rate.data<T>(), out_data,
-        in_row_numel);
+class SGDOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param = ctx.Input<framework::Tensor>("Param");
+    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    auto* grad_var = ctx.InputVar("Grad");
+    // Actually, all tensors are LoDTensor except SelectedRows.
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      param_out->mutable_data<T>(ctx.GetPlace());
+      auto* grad = ctx.Input<framework::Tensor>("Grad");
+      auto* grad_data = grad->data<T>();
+      auto* param_data = param->data<T>();
+      auto* param_out_data = param_out->data<T>();
+
+      int block = 512;
+      int grid = (param->numel() + block - 1) / block;
+
+      SGDKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+          grad_data, param_data, learning_rate->data<T>(), param->numel(),
+          param_out_data);
+
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
+      // This manual optimization brings difficulty to track data dependency.
+      // It's better to find a more elegant solution.
+      PADDLE_ENFORCE_EQ(param, param_out);
+      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
+
+      auto in_height = grad->height();
+      auto out_dims = param_out->dims();
+      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+
+      auto& in_value = grad->value();
+      auto& in_rows = grad->rows();
+
+      int64_t in_row_numel = in_value.numel() / in_rows.size();
+      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
+
+      auto* in_data = in_value.data<T>();
+      auto* out_data = param_out->data<T>();
+
+      const int block_size = 256;
+      dim3 threads(block_size, 1);
+      dim3 grid(1, in_rows.size());
+      SparseSGDFunctorKernel<
+          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+          in_data, in_rows.data(), learning_rate->data<T>(), out_data,
+          in_row_numel);
+
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
   }
 };
-
-template struct SparseSGDFunctor<platform::CUDADeviceContext, float>;
-template struct SparseSGDFunctor<platform::CUDADeviceContext, double>;
-
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sgd, ops::SGDOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SGDOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel<float>,
+                        ops::SGDOpCUDAKernel<double>);
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index c920025a91cd0b68019bcb05558398093f31e206..a6c544591e1172320f6cf7192bf640ff25225b99 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -20,15 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
-struct SparseSGDFunctor {
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input,
-                  const framework::Tensor& learning_rate,
-                  framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T>
+template <typename T>
 class SGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -45,21 +37,36 @@ class SGDOpKernel : public framework::OpKernel<T> {
       auto p = framework::EigenVector<T>::Flatten(*param);
       auto g = framework::EigenVector<T>::Flatten(*grad);
       auto o = framework::EigenVector<T>::Flatten(*param_out);
-      auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
-      auto& place =
-          *ctx.template device_context<DeviceContext>().eigen_device();
+      auto* lr = learning_rate->data<T>();
 
-      Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-      o.device(place) = p - lr.broadcast(grad_dsize) * g;
+      o = p - lr[0] * g;
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
       // This manual optimization brings difficulty to track data dependency.
       // It's better to find a more elegant solution.
       PADDLE_ENFORCE_EQ(param, param_out);
       auto* grad = ctx.Input<framework::SelectedRows>("Grad");
-      SparseSGDFunctor<DeviceContext, T> functor;
-      functor(ctx.template device_context<DeviceContext>(), *grad,
-              *learning_rate, param_out);
+
+      auto in_height = grad->height();
+      auto out_dims = param_out->dims();
+      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+
+      auto& in_value = grad->value();
+      auto& in_rows = grad->rows();
+
+      int64_t in_row_numel = in_value.numel() / in_rows.size();
+      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
+
+      auto* in_data = in_value.data<T>();
+      auto* out_data = param_out->data<T>();
+      auto* lr = learning_rate->data<T>();
+
+      for (size_t i = 0; i < in_rows.size(); i++) {
+        for (int64_t j = 0; j < in_row_numel; j++) {
+          out_data[in_rows[i] * in_row_numel + j] -=
+              lr[0] * in_data[i * in_row_numel + j];
+        }
+      }
     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");
     }
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index c380e606869fd2c559c7d5f378857ca74fa8d8d3..e5ef0740b6f385de7f17a3a419000cb8c897d986 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/operators/array_operator.h"
 #include "paddle/operators/math/math_function.h"
@@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp {
       : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x_var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
     auto &x_tensor = x_var->Get<framework::LoDTensor>();
-    size_t offset = this->GetOffset(scope, dev_ctx);
+    size_t offset = this->GetOffset(scope, place);
     auto *rank_table_var = scope.FindVar(Input("RankTable"));
     PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
     auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
@@ -54,8 +54,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {
 
 class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
-                              framework::OpAttrChecker *op_checker)
+  ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
     AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
@@ -94,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
       : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
     auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
     PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
@@ -106,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     dx_tensor.Resize(x_tensor.dims());
     dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
 
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     if (dout_var == nullptr) {  // dx_tensor fill zero
       math::set_constant(dev_ctx, &dx_tensor, 0.0f);
     } else {
@@ -137,14 +140,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
     op->SetType("shrink_rnn_memory_grad");
     op->SetInput("X", Input("X"));
     op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(op);
+    return std::unique_ptr<framework::OpDesc>(op);
   }
 };
 
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
index b8a1bf122a78df1e0d8291c77a61b3f917d40960..c526a88a127da12a6384777bca31b60873844d94 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
 
@@ -86,8 +86,8 @@ class SigmoidCrossEntropyWithLogitsGradOp
 class SigmoidCrossEntropyWithLogitsOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto,
-                                       framework::OpAttrChecker* op_checker)
+  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
+                                       OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
index 1b569c93ed9568a26824defef0d25bb1c3dadad4..3f393265f48b428dca8703ff77688de979fb63df 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
index 8fe7c5ba8224f8dac5de8d7ee772ebc71f987d69..b78bcc436e9fa5c5d4db3fbb22224e328c3bc3c2 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc
index d5a7ccb77e7d9ad3a93702861dbab295c4ab5bce..f63eaa4464cc668acdb8e5b8a74ad5bba936db44 100644
--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/sign_op.h"
 
@@ -34,7 +34,7 @@ class SignOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SignOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) Input tensor of sign operator.");
     AddOutput("Out", "(Tensor) Output tensor of sign operator.");
@@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("scale");
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttr("scale", 0.0f);
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu
index 9bc1c65d214ba8f988dec3b7b11da9e1ec3a6581..f224880cffb2154a7c46b8a4701d7357e67bb70c 100644
--- a/paddle/operators/sign_op.cu
+++ b/paddle/operators/sign_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/sign_op.h"
 
diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h
index 2e476ed6658491b3dcec3cf1388ccc4a0813449c..9fe49ae1a2161d9f1472eef830c11b0f8305c568 100644
--- a/paddle/operators/sign_op.h
+++ b/paddle/operators/sign_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index 56e8d9058fcc035c28e74daff778c4e034f46b44..dcb18d729da69beaa556e4b93129dafb08b72c06 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/smooth_l1_loss_op.h"
 
@@ -47,8 +47,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SmoothL1LossOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu
index 8e94ebac644d1047920827250c4313c657b22ea0..213429bc370ef0d5b493b3a448df1b3bf0e4e87c 100644
--- a/paddle/operators/smooth_l1_loss_op.cu
+++ b/paddle/operators/smooth_l1_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h
index 1a70c9c63c340d66b6bf0db97cc8ab35a663f816..3facfae116d711f86ea5c193562c20ea60a2efc9 100644
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 0988c83d43535d7ee1bcef87bf506e5db1a3ecc0..e7306bc5f13377813e0bd49846bc834d501602eb 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -24,25 +24,24 @@ class SoftmaxOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SoftmaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of SoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SoftmaxOp should not be null.");
 
     auto x_dims = ctx->GetInputDim("X");
     PADDLE_ENFORCE(x_dims.size() == 2UL,
                    "The input of softmax op must be a matrix.");
-    ctx->SetOutputDim("Y", x_dims);
+    ctx->SetOutputDim("Out", x_dims);
   }
 };
 
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input tensor of softmax. "
              "2-D with shape [batch_size, input_feature_dimensions].");
-    AddOutput("Y", "The normalized values with the same shape as X.");
+    AddOutput("Out", "The normalized values with the same shape as X.");
     AddComment(R"DOC(
 Softmax Operator.
 
@@ -60,7 +59,7 @@ exponential values of all the other dimensions is the output of the softmax
 operator.
 
 For each row $i$ and each column $j$ in Input(X), we have:
-    $$Y[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
+    $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
 
 )DOC");
   }
@@ -71,12 +70,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Y"),
-                      ctx->GetInputDim(framework::GradVarName("Y")),
-                      "Input(Y) and its gradients should have a same shape.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"),
+                      ctx->GetInputDim(framework::GradVarName("Out")),
+                      "Input(Out) and its gradients should have a same shape.");
 
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc
index 7b9882cbcfe1a0381541386f76867c6bb0f1fe55..e7da40f3e82d5db858a795e9634abf57b884d6a2 100644
--- a/paddle/operators/softmax_op.cu.cc
+++ b/paddle/operators/softmax_op.cu.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/softmax_op.h"
 
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 0f8998b99e93b5ed6c9b43ad7adabc2d515c1ff1..63e379a3b31a6c75aab0c56b4ce1b988fa7f0318 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -26,13 +26,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<Tensor>("X");
-    auto* Y = context.Output<Tensor>("Y");
+    auto* Out = context.Output<Tensor>("Out");
 
     // allocate memory on device.
-    Y->mutable_data<T>(context.GetPlace());
+    Out->mutable_data<T>(context.GetPlace());
 
     math::SoftmaxFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), X, Y);
+        context.template device_context<DeviceContext>(), X, Out);
   }
 };
 
@@ -40,15 +40,15 @@ template <typename DeviceContext, typename T>
 class SoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* Y = context.Input<Tensor>("Y");
-    auto* dY = context.Input<Tensor>(framework::GradVarName("Y"));
+    auto* Out = context.Input<Tensor>("Out");
+    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
 
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
     math::SoftmaxGradFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), Y, dY, dX);
+        context.template device_context<DeviceContext>(), Out, dOut, dX);
   }
 };
 
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index 0c302288637ad1713e133d37faa0fb338e1f7022..41e65b701e62bd2e671f3590869a5d7fed90701c 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -1,10 +1,10 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -20,8 +20,7 @@ namespace operators {
 class SoftmaxWithCrossEntropyOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto,
-                                 framework::OpAttrChecker* op_checker)
+  SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Logits",
              "(Tensor, default: Tensor<float>), The unscaled log probabilities "
@@ -119,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
@@ -160,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(
@@ -174,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
     grad_op->SetType("softmax_with_cross_entropy_grad");
     grad_op->SetInput("Label", Input("Label"));
     grad_op->SetInput("Softmax", Output("Softmax"));
@@ -184,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
     grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index 6100c63f9aba006d9739173a8a5a2fb398187e55..61583c6161c3bbc62788dc8b6940ddcc29b2302a 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -1,10 +1,10 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index 9c3431605b2f2285b2e7d71c5ff2f4a53c6c6f30..6bde0f37e06ccf7d81487e0c99227287787c5d72 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -1,10 +1,10 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
index f164a4771186635232fea46327ca1fb8b86f2852..2d8787d740c70f1d4696fdec381b572ecf031f57 100644
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto *out_true =
@@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
     auto &x_lod = x.lod();
     auto &mask_dim = mask.dims();
 
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
     if (platform::is_cpu_place(mask.place())) {
       cpu_mask->ShareDataWith(mask);
@@ -118,8 +122,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
 
 class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input LoDTensor");
     AddInput("Mask", "A bool column vector which mask the input");
@@ -164,8 +167,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("merge_lod_tensor");
     grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
     grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
@@ -173,7 +176,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetInput("X", Input("X"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc
index 275b25e96aa75fdbcb7275e272c49ea8d278d2c8..4dfae043cb1091c9491d89aec4d1415d4741e013 100644
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -65,7 +65,7 @@ class SplitOp : public framework::OperatorWithKernel {
 
 class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) Input tensor of the split operator.");
     AddOutput("Out", "(Tensor) Output tensors of the split operator.")
@@ -108,13 +108,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
     op->SetType("concat");
     op->SetInput("X", OutputGrad("Out"));
     op->SetOutput("Out", InputGrad("X"));
     op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(op);
+    return std::unique_ptr<framework::OpDesc>(op);
   }
 };
 
diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0aa87b0f06ca9c7d156dfdf8df188da68ac1450
--- /dev/null
+++ b/paddle/operators/spp_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+namespace paddle {
+namespace operators {
+
+class SppOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SppOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of spp operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of spp operator."
+              "N * M."
+              "M = C * H * W");
+    AddAttr<int>("pyramid_height", "(int), multi level pooling");
+    AddAttr<std::string>(
+        "pooling_type",
+        "(string), pooling type, can be \"max\" for max-pooling "
+        "and \"avg\" for average-pooling.")
+        .InEnum({"max", "avg"});
+    AddComment(R"DOC(
+        "With spatial pyramid pooling, the input image can
+        be of any sizes. This not only allows arbitrary aspect
+        ratios, but also allows arbitrary scales. We can resize
+        the input image to any scale (e.g., min(w, h)=180, 224,
+        ...) and apply the same deep network. When the
+        input image is at different scales, the network (with
+        the same filter sizes) will extract features at different
+        scales. The scales play important roles in traditional
+        methods.
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = N \\
+            W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in}
+          $$
+        paper https://arxiv.org/pdf/1406.4729v4.pdf
+        )DOC");
+  }
+};
+
+class SppOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SppOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SppOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Spping intput must be of 4-dimensional.");
+    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
+    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class SppOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/spp_op.cu.cc b/paddle/operators/spp_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..761e4d6c4a9639898ba548d56bed3c8817720c1b
--- /dev/null
+++ b/paddle/operators/spp_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/spp_op.h b/paddle/operators/spp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f35b305d02c73bcae6e72b8afa5ce55148ea98b8
--- /dev/null
+++ b/paddle/operators/spp_op.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SppKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    out->mutable_data<T>(context.GetPlace());
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t output_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // pooling output shape
+      framework::Tensor out_level;
+      std::vector<int64_t> output_shape_vec(
+          {in_x->dims()[0], in_x->dims()[1], bins, bins});
+      framework::DDim output_shape(framework::make_ddim(output_shape_vec));
+      out_level.mutable_data<T>(output_shape, context.GetPlace());
+      // pooling
+      if (pooling_type == "max") {
+        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
+        math::MaxPool<T> max_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, max_process, &out_level);
+      } else if (pooling_type == "avg") {
+        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
+        math::AvgPool<T> avg_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, avg_process, &out_level);
+      }
+      // flatten pooling output shape
+      int output_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> output_flatten_shape_vec(
+          {in_x->dims()[0], output_flatten_w});
+      framework::DDim output_flatten_shape(
+          framework::make_ddim(output_flatten_shape_vec));
+      out_level.Resize(output_flatten_shape);
+      // concat
+      auto out_level_stride = framework::stride(out_level.dims());
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_level.data<T>(), out_level_stride, out_level.dims(),
+                       out_stride, out->data<T>() + output_offset);
+      output_offset += out_level.dims()[1] * out_level_stride[1];
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class SppGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    zero(device_ctx, in_x_grad, static_cast<T>(0));
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t out_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // split out and outgrad  ...  to flatten
+      framework::Tensor out_level;
+      framework::Tensor outgrad_level;
+      int out_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> out_flatten_shape_vec(
+          {in_x->dims()[0], out_flatten_w});
+      framework::DDim out_flatten_shape(
+          framework::make_ddim(out_flatten_shape_vec));
+      out_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      auto flatten_stride = framework::stride(out_level.dims());
+      // memcpy
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out->data<T>() + out_offset, out_stride,
+                       out_level.dims(), flatten_stride, out_level.data<T>());
+
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_grad->data<T>() + out_offset, out_stride,
+                       outgrad_level.dims(), flatten_stride,
+                       outgrad_level.data<T>());
+      out_offset += out_level.dims()[1] * out_stride[1];
+      // flatten backward to nchw
+
+      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
+      out_shape_vec.push_back(
+          (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1);
+      out_shape_vec.push_back(
+          (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1);
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out_level.ShareDataWith(out_level);
+      out_level.Resize(out_shape);
+      outgrad_level.ShareDataWith(outgrad_level);
+      outgrad_level.Resize(out_shape);
+      // pooling backward
+      if (pooling_type == "max") {
+        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
+        pool2d_backward(context.template device_context<DeviceContext>(), *in_x,
+                        *&out_level, *&outgrad_level, kernel_size, strides,
+                        paddings, in_x_grad);
+      } else if (pooling_type == "avg") {
+        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
+            pool_backward;
+        math::AvgPoolGrad<T> avg_process;
+        pool_backward(context.template device_context<DeviceContext>(), *in_x,
+                      *&out_level, *&outgrad_level, kernel_size, strides,
+                      paddings, avg_process, in_x_grad);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index 50bc6da196e642e3860874cfb883390dd2e93215..9e097176f3434e81e31f2ecf4093af47b654e816 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
 class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquaredL2DistanceOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
     AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");
diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu
index ecc82ed1e49501b05e0cf54e5b44114db150a427..f2648dde5eb9c56aed3fad81521e6207dc2d973e 100644
--- a/paddle/operators/squared_l2_distance_op.cu
+++ b/paddle/operators/squared_l2_distance_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc
index 3cff61a02f71fadf99f73787e2b2c179f7d441a8..6626bf0375548eac457f960105ce33e63e1a3706 100644
--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/squared_l2_norm_op.h"
 
@@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel {
 
 class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquaredL2NormOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input of squared_l2_norm op.");
     AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");
diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu
index 2d6567d090a96a43cbda203fb8176041d719e55f..b222113a8c82061bd841da440a714b66f6c1fb9c 100644
--- a/paddle/operators/squared_l2_norm_op.cu
+++ b/paddle/operators/squared_l2_norm_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/squared_l2_norm_op.h"
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h
index 0ced7e7d70ab3627a337d70890db6842ba0f7768..1ce26c775ed5700cee73d00f4c51d58a692a1152 100644
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
diff --git a/paddle/operators/strided_memcpy.h b/paddle/operators/strided_memcpy.h
index c9dd80518424017d9834a2bf7aee14caa56c9d79..735cabcd973a3b5ea1ab8ab57091eae14e23b89b 100644
--- a/paddle/operators/strided_memcpy.h
+++ b/paddle/operators/strided_memcpy.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/operators/detail/strided_memcpy.h"
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
index 68f064eaee5851333ddf9767b7138da83a28503d..06d81188558aad85c41c56ddefad3617d48da74c 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/strided_memcpy.h"
 #include "gtest/gtest.h"
@@ -82,11 +82,13 @@ TEST(StridedMemcpy, GPUCrop) {
   };
   // clang-format on
 
-  platform::GPUPlace gpu0(0);
+  platform::CUDAPlace gpu0(0);
   platform::CPUPlace cpu;
 
+  platform::CUDADeviceContext ctx(gpu0);
+
   int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   framework::DDim src_stride({5, 1});
 
@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({2, 1});
 
-  platform::CUDADeviceContext ctx(gpu0);
   StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
                      gpu_dst);
 
@@ -120,11 +121,12 @@ TEST(StridedMemcpy, GPUConcat) {
   };
   // clang-format on
 
-  platform::GPUPlace gpu0(0);
+  platform::CUDAPlace gpu0(0);
   platform::CPUPlace cpu;
+  platform::CUDADeviceContext ctx(gpu0);
 
   int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   int dst[8];
   int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
   framework::DDim src_stride({2, 1});
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({4, 1});
-  platform::CUDADeviceContext ctx(gpu0);
 
   StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
   StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index cd52672f78e3e5826e8dfff92fb8e4668c5c6dcd..b86e8266425ca094a51d224fd39ce33700057f13 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -29,7 +29,7 @@ class SumOp : public framework::OperatorWithKernel {
                    "Output(Out) of SumOp should not be null.");
     if (ctx->IsRuntime() &&
         ctx->GetOutputsVarType("Out")[0] ==
-            framework::VarDesc::LOD_TENSOR_ARRAY) {
+            framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
       return;  // skip runtime infershape when is tensor array;
     }
 
@@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
@@ -72,8 +72,8 @@ class SumOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_NE(dtype, -1,
                         "Sum operator should have at least one tensor");
 
-      return framework::OpKernelType(static_cast<framework::DataType>(dtype),
-                                     ctx.device_context());
+      return framework::OpKernelType(
+          static_cast<framework::proto::DataType>(dtype), ctx.device_context());
     } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
       return framework::OpKernelType(
           framework::ToDataType(
@@ -98,7 +98,7 @@ class SumOp : public framework::OperatorWithKernel {
 
 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SumOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
@@ -106,8 +106,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Sum operator.
 
-This operators sums the input tensors. All the inputs can carry the 
-LoD (Level of Details) information. However, the output only shares 
+This operators sums the input tensors. All the inputs can carry the
+LoD (Level of Details) information. However, the output only shares
 the LoD information with the first input.
 )DOC");
   }
@@ -115,10 +115,10 @@ the LoD information with the first input.
 
 class SumOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind& op_desc,
-                  framework::BlockDescBind* block) const override {
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
     auto& inputs = op_desc.Input("X");
-    auto var_type = framework::VarDesc::SELECTED_ROWS;
+    auto var_type = framework::proto::VarDesc::SELECTED_ROWS;
 
     for (auto& name : op_desc.Input("X")) {
       VLOG(10) << name << " "
@@ -128,12 +128,12 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
     bool any_input_is_lod_tensor = std::any_of(
         inputs.begin(), inputs.end(), [block](const std::string& name) {
           return block->FindRecursiveOrCreateVar(name)->GetType() ==
-                 framework::VarDesc::LOD_TENSOR;
+                 framework::proto::VarDesc::LOD_TENSOR;
         });
 
     auto is_tensor_array = [block](const std::string& name) {
       return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
-             framework::VarDesc::LOD_TENSOR_ARRAY;
+             framework::proto::VarDesc::LOD_TENSOR_ARRAY;
     };
 
     bool any_input_is_tensor_array =
@@ -152,9 +152,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
         PADDLE_ENFORCE(all_inputs_are_tensor_array,
                        "Not all inputs are tensor array:\n%s", os.str());
       }
-      var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
+      var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY;
     } else if (any_input_is_lod_tensor) {
-      var_type = framework::VarDesc::LOD_TENSOR;
+      var_type = framework::proto::VarDesc::LOD_TENSOR;
     }
 
     auto out_var_name = op_desc.Output("Out").front();
@@ -169,20 +169,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
  public:
   using framework::GradOpDescMakerBase::GradOpDescMakerBase;
 
-  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
-      const override {
-    auto x_grads = InputGrad("X");
-    std::vector<std::unique_ptr<framework::OpDescBind>> grad_ops;
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    auto x_grads = InputGrad("X", false);
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_ops;
     grad_ops.reserve(x_grads.size());
     auto og = OutputGrad("Out");
     std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops),
                    [&og](const std::string& x_grad) {
-                     auto* grad_op = new framework::OpDescBind();
+                     auto* grad_op = new framework::OpDesc();
                      grad_op->SetType("scale");
                      grad_op->SetInput("X", og);
                      grad_op->SetOutput("Out", {x_grad});
                      grad_op->SetAttr("scale", 1.0f);
-                     return std::unique_ptr<framework::OpDescBind>(grad_op);
+                     return std::unique_ptr<framework::OpDesc>(grad_op);
                    });
     return grad_ops;
   }
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 2835b84f75cad6c8fb01d02b93bb9ff79edb1088..53e38ec70336ca7f2d7c142e5fb1bbe427ab2957 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/operators/array_operator.h"
 #include "paddle/operators/detail/safe_ref.h"
 namespace paddle {
@@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp {
       : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) return;
     auto &x_tensor = x->Get<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, dev_ctx);
+    size_t offset = GetOffset(scope, place);
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
     if (offset >= out->size()) {
@@ -39,7 +39,12 @@ class WriteToArrayOp : public ArrayOp {
     }
     if (x_tensor.memory_size() > 0) {
       auto *out_tensor = &out->at(offset);
-      CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
+
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+
+      CopyFrom(x_tensor, place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_tensor.lod());
     } else {
       VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
@@ -51,8 +56,7 @@ class WriteToArrayOp : public ArrayOp {
 
 class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WriteToArrayOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
     AddInput(
@@ -97,14 +101,14 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
 
 class WriteToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
     auto x_name = op_desc.Input("X")[0];
     auto out_name = op_desc.Output("Out")[0];
     VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
     auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
                             "Cannot found %s", out_name);
-    out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
     auto *x = block->FindVarRecursive(x_name);
     if (x != nullptr) {
       out.SetDataType(x->GetDataType());
@@ -120,17 +124,19 @@ class ReadFromArrayOp : public ArrayOp {
                   const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x != nullptr, "X must be set");
     auto &x_array = x->Get<framework::LoDTensorArray>();
     auto *out = scope.FindVar(Output("Out"));
     PADDLE_ENFORCE(out != nullptr, "Out must be set");
     auto *out_tensor = out->GetMutable<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, dev_ctx);
+    size_t offset = GetOffset(scope, place);
     if (offset < x_array.size()) {
-      framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
-                          out_tensor);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+      framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_array[offset].lod());
     } else {
       VLOG(10) << "offset " << offset << " >= " << x_array.size();
@@ -140,8 +146,7 @@ class ReadFromArrayOp : public ArrayOp {
 
 class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReadFromArrayProtoMaker(framework::OpProto *proto,
-                          framework::OpAttrChecker *op_checker)
+  ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(TensorArray) the array will be read from.");
     AddInput("I",
@@ -177,14 +182,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("read_from_array");
     grad_op->SetInput("I", Input("I"));
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
@@ -193,14 +198,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("write_to_array");
     grad_op->SetInput("I", Input("I"));
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc
index 16ae925eb5cab1c05f3bc376972cabadc4367d20..bb72210bb67f925af3e450961069f0737dbde35e 100644
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -46,7 +46,7 @@ class TopkOp : public framework::OperatorWithKernel {
 
 class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input of Topk op");
     AddOutput("Out", "(Tensor) The output tensor of Topk op");
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
index 453bd07267e3a6e33211117368dd9aff10a9e23f..f7bf58e7218cb9b94526cb9346bc5f9aa971038a 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -1,16 +1,16 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/assert.h"
@@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index de5ff561add6183828f6bb4c44e30f6bb13079fa..11615d806a61b3525d2ed50f5ea5940e8d61c8f8 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/transpose_op.h"
 
@@ -55,8 +55,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TransposeOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
@@ -71,18 +70,31 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 Transpose Operator.
 
 The input tensor will be permuted according to the axis values given.
-The op functions similar to how numpy.transpose works in python.
+The op functions is similar to how numpy.transpose works in python.
+
 For example:
- >> input = numpy.arange(6).reshape((2,3))
- >> input
- array([[0, 1, 2],
-        [3, 4, 5]])
- >> axis = [1, 0]
- >> output = input.transpose(axis)
- >> output
- array([[0, 3],
-        [1, 4],
-		[2, 5]])
+
+    .. code-block:: text
+
+      input = numpy.arange(6).reshape((2,3))
+
+      the input is:
+
+      array([[0, 1, 2],
+             [3, 4, 5]])
+
+      given axis is:
+
+      [1, 0]
+
+      output = input.transpose(axis)
+
+      then the output is:
+
+      array([[0, 3],
+             [1, 4],
+             [2, 5]])
+
 So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1},
 the output tensor shape will be (N, H, W, C)
 
diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc
index 7d23f1493ec2d548438aeb2493fda8a4ff8c6e80..281c4468cc267c659befd238c9a286dd23eaf16d 100644
--- a/paddle/operators/transpose_op.cu.cc
+++ b/paddle/operators/transpose_op.cu.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/transpose_op.h"
 
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
index d995271a6be3266e05c742ab18c34636da384e66..b9686a2db3f76dbb9b8ebdba5e243f5e5a3c571a 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 2a49ee471f67cda87415db0e1440a4992c0cd088..4d5dd86cb8103a76247913cc088db4cff6b6ff43 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 
-#include <random>
-#include <type_traits>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
@@ -63,18 +63,17 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
   }
 };
 
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  UniformRandomOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Out", "(Tensor) The output tensor of uniform random op");
     AddComment(R"DOC(
@@ -100,7 +99,7 @@ uniform distribution.
                  "0 means use a seed generated by the system.")
         .SetDefault(0);
     AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index cfe9d293cff2108cf25749d0e78e2e86e6e198a5..719d0872a7cba55dc97c95a73a7f86614ab3f4b7 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
index 49df2a530cd0c5c13f08db4b1e7db62679081e9b..aeed9679b2a3cce744189809c223a5b0d581ccdd 100644
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -18,8 +18,7 @@ namespace operators {
 
 class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Unpool2dOpMaker(framework::OpProto* proto,
-                  framework::OpAttrChecker* op_checker)
+  Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
@@ -54,16 +53,14 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
         "(string), unpooling type, can be \"max\" for max-unpooling ")
         .InEnum({"max"});
     AddComment(R"DOC(
-        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
-        Output shape: $(N, C_{out}, H_{out}, W_{out})$
-        Where
-          $$
-            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
-            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
-          $$
-        Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
-        /07/iccv2011.pdf
-        )DOC");
+Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
+$(N, C_{out}, H_{out}, W_{out})$, where
+$$
+H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+$$
+Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
+)DOC");
   }
 };
 
@@ -74,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) {
 
 class UnpoolOp : public framework::OperatorWithKernel {
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -113,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel {
 
 class UnpoolOpGrad : public framework::OperatorWithKernel {
  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 9a092a570ff1f3f529413cd44dff55147adbaadc..728ef6079465d57f54dab383aac5e2bb750fe113 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <vector>
 #include "paddle/framework/executor.h"
@@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase {
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
 
-    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    framework::Executor executor(dev_place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
     auto *program = block->Program();
 
     auto step_scopes =
@@ -64,7 +65,7 @@ class WhileOp : public framework::OperatorBase {
 
 class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(kParameters,
              "A set of variables, which are required by operators inside the "
@@ -82,8 +83,8 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
               "(StepScopeVar) A vector of local scope, which size equals the "
               "step number of While Op. The i'th scope storages temporary "
               "variables generated in the i'th step.");
-    AddAttr<framework::BlockDescBind *>(kStepBlock,
-                                        "The step block inside WhileOp");
+    AddAttr<framework::BlockDesc *>(kStepBlock,
+                                    "The step block inside WhileOp");
     AddComment(R"DOC(
 )DOC");
   }
@@ -97,9 +98,9 @@ class WhileGradOp : public framework::OperatorBase {
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
-    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+           const platform::Place &dev_place) const override {
+    framework::Executor executor(dev_place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
     auto *program = block->Program();
 
     auto *step_scopes =
@@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase {
             auto zero_op = framework::OpRegistry::CreateOp(
                 "fill_constant", framework::VariableNameMap{},
                 {{"Out", {pg_names[param_id]}}}, attrs);
-            zero_op->Run(scope, dev_ctx);
+            zero_op->Run(scope, dev_place);
           }
         }
 
@@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase {
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {pg_names[param_id], new_inside_name}}},
             {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
-        sum_op->Run(cur_scope, dev_ctx);
+        sum_op->Run(cur_scope, dev_place);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
     }
@@ -209,8 +210,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad = new framework::OpDesc();
     grad->SetType("while_grad");
     grad->SetInput(kParameters, Input(kParameters));
 
@@ -279,14 +280,14 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     // while operator could be renamed.
     grad->SetAttr("original_output_grad", extra_inputs_list);
 
-    return std::unique_ptr<framework::OpDescBind>(grad);
+    return std::unique_ptr<framework::OpDesc>(grad);
   }
 };
 
 class WhileGradOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
     auto p_names = op_desc.Input(kParameters);
     auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
 
@@ -321,10 +322,10 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
         continue;
       }
       auto dims = ctx->GetInputsElementDim(kParameters, i);
-      if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
+      if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
         names_to_set.push_back(pg_names[i]);
         dims_to_set.push_back(dims);
-      } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
+      } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
         // not sure how to set the dim of LOD_TENSOR_ARRAY
         names_to_set.push_back(pg_names[i]);
         dims_to_set.push_back(dims);
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index 5cc7c47d4486c3d149c37fd6e312780f3d44eda8..8ca048257e53af21f327ee1b8f24b9e2e5d54060 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "adadelta_optimizer.h"
 #include <algorithm>
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 6aab1ad553b15ebbd2d04c9323c5e56e1b8f60f5..48f1ae175041cc1cfa11787c44317dd173430928 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index c981996bab1b2e7ae5d6e2d858a73efde12e32f3..c6d39a366adcd40e7298e1f1bed48c15f517a217 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <cmath>
 
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 447b7c7547d5bad7436df6f3b3582b4a219f08c8..b0cff061f5c311d2356bab4965ec65c5eca3dbb8 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index 6dc2d749708d0e2a7f36734d89eec30d4576842e..8a384b59c47df321438e3a3bc245a2398a7ef6d1 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "adam_optimizer.h"
 #include <cmath>
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 37ab53afc37a5f749a2909de12c7871ed926583f..7df40064df31536cead9eba4ce827452cf0dce38 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index faa23764522cef03bae1359adbf58d10ee7809ac..3af44484363654befe85c8511d7c0791d5a33ab8 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "optimizer.h"
 #include <glog/logging.h>
diff --git a/paddle/optimizer/optimizer.h b/paddle/optimizer/optimizer.h
index e6fa12a4d250ccb078358704b0131942ea6ab039..516e612167f557f043d3fe29789bec4b73405755 100644
--- a/paddle/optimizer/optimizer.h
+++ b/paddle/optimizer/optimizer.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index da92c2d01cc2a27d1fadd51a338d23b01e0cb0bc..1603e5fdc8a69d8b117f02b9c950a5374625c632 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <glog/logging.h>
 #include "adadelta_optimizer.h"
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index 99d0416e751c4ca6695d6ed77396e18d48fc86b8..1f501c49e1d7a8018f3274ae1128ca230b9298bb 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc
index c150144ac24b8375d08691a98be680b6bf5d1e7f..ee80f543fc4c1d87ba56540fc581816df8a39793 100644
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "sgd_optimizer.h"
 #include "serialization.h"
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 0b1da0aa27d98e8d6a8d9fd7a1ebe355acb2a1f4..16a4df9973ef6a599a8ecee6be5903f13799771c 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 88df28a9668e5f354d115ff8ab32cb21e03aefb5..f0a0ea70a0aa14e1db959e4e6ace2a44363d0c35 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -25,7 +25,7 @@ ENDIF()
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
-nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
+nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
index d9f49527dcf150fcb35d3af512088f75dec0b5c6..00337a7f051758559a0f8012d8c78dbe8e3457a6 100644
--- a/paddle/platform/call_once.h
+++ b/paddle/platform/call_once.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
index b6311cb23d695c3cd851bcca120c24cced7fdd62..67d5f626d41c0fae280062533d6c1d1074341164 100644
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
@@ -22,23 +22,7 @@ namespace paddle {
 namespace platform {
 
 void CudaProfilerInit(std::string output_file, std::string output_mode,
-                      std::vector<std::string> config_flags) {
-  std::array<char, 128> buf;
-  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
-  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
-  memcpy(buf.data(), tmpl.data(), tmpl.size());
-  auto result = mktemp(buf.data());
-  PADDLE_ENFORCE(strlen(result) != 0);
-  std::string config_file = result;
-
-  {
-    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
-    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
-    for (const auto& line : config_flags) {
-      ofs << line << std::endl;
-    }
-  }
-
+                      std::string config_file) {
   PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
   cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
   PADDLE_ENFORCE(
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 1c72b5055971e73c7aa560a61ca9d3c48dc56fbc..ea07f2e002cb76d09a11f7a5305c2d45b780e7bd 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -15,11 +15,46 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+DeviceContextPool* DeviceContextPool::pool = nullptr;
+
+const platform::DeviceContext* DeviceContextPool::Get(
+    const platform::Place& place) {
+  auto it = device_contexts_.find(place);
+  if (it == device_contexts_.end()) {
+    PADDLE_THROW(
+        "'Place' is not supported, Please re-compile with WITH_GPU "
+        "option");
+  }
+  return it->second;
+}
+
+DeviceContextPool::DeviceContextPool(
+    const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  for (size_t i = 0; i < places.size(); i++) {
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_.emplace(places[i],
+                               new platform::CPUDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+    } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_.emplace(places[i],
+                               new platform::CUDADeviceContext(
+                                   boost::get<platform::CUDAPlace>(places[i])));
+#else
+      PADDLE_THROW(
+          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
+#endif
+    }
+  }
+}
+
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
 
-CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
 
@@ -27,7 +62,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
 
-Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+Place CPUDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_CUDA
 
@@ -38,7 +73,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
   ~EigenCudaStreamDevice() override {}
 
-  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
+  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
     stream_ = cuda_stream;
     place_ = place;
     device_prop_ = &Eigen::m_deviceProperties[place.device];
@@ -77,14 +112,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
 
  private:
-  GPUPlace place_;
+  CUDAPlace place_;
   const cudaStream_t* stream_;         // not owned;
   const cudaDeviceProp* device_prop_;  // not owned;
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
 };
 
-CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   SetDeviceId(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
@@ -125,21 +160,19 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
-CudnnDeviceContext::CudnnDeviceContext(CudnnPlace place)
-    : CUDADeviceContext(place), place_(place) {
+CUDNNDeviceContext::CUDNNDeviceContext(CUDAPlace place)
+    : CUDADeviceContext(place) {
   PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
   PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream()));
 }
 
-CudnnDeviceContext::~CudnnDeviceContext() {
-  SetDeviceId(place_.device);
+CUDNNDeviceContext::~CUDNNDeviceContext() {
+  SetDeviceId(boost::get<CUDAPlace>(GetPlace()).device);
   Wait();
   PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
 }
 
-Place CudnnDeviceContext::GetPlace() const { return CudnnPlace(); }
-
-cudnnHandle_t CudnnDeviceContext::cudnn_handle() const { return cudnn_handle_; }
+cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
 #endif
 
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index f67194993db1f4160bd6894b2c845a82f4da2354..dfef2c16d8f2277d57cbcfe51d108402e518799b 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -11,8 +11,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
+#include <memory>
+#include <unordered_map>
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/platform/dynload/cublas.h"
@@ -20,10 +20,13 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
-#include <memory>
+
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
+#include "glog/logging.h"
+
 namespace paddle {
 namespace platform {
 
@@ -45,6 +48,7 @@ class CPUDeviceContext : public DeviceContext {
   Place GetPlace() const override;
 
  private:
+  CPUPlace place_;
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
@@ -54,7 +58,7 @@ class EigenCudaStreamDevice;
 
 class CUDADeviceContext : public DeviceContext {
  public:
-  explicit CUDADeviceContext(GPUPlace place);
+  explicit CUDADeviceContext(CUDAPlace place);
   virtual ~CUDADeviceContext();
 
   /*! \brief  Wait for all operations completion in the stream. */
@@ -76,7 +80,7 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream() const;
 
  private:
-  GPUPlace place_;
+  CUDAPlace place_;
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
@@ -86,23 +90,59 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };
 
-class CudnnDeviceContext : public CUDADeviceContext {
+class CUDNNDeviceContext : public CUDADeviceContext {
  public:
-  explicit CudnnDeviceContext(CudnnPlace place);
-  virtual ~CudnnDeviceContext();
-
-  /*! \brief  Return place in the device context. */
-  Place GetPlace() const final;
+  explicit CUDNNDeviceContext(CUDAPlace place);
+  virtual ~CUDNNDeviceContext();
 
   /*! \brief  Return cudnn  handle in the device context. */
   cudnnHandle_t cudnn_handle() const;
 
  private:
   cudnnHandle_t cudnn_handle_;
-  CudnnPlace place_;
 };
 
 #endif
 
+/*! \brief device context pool singleton */
+class DeviceContextPool {
+ public:
+  explicit DeviceContextPool(const std::vector<platform::Place>& places);
+
+  static DeviceContextPool& Instance() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+
+  /*! \brief  Create should only called by Init function */
+  static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+
+  /*! \brief  Return handle of single device context. */
+  const platform::DeviceContext* Get(const platform::Place& place);
+
+ private:
+  static DeviceContextPool* pool;
+  constexpr static int LEFT_SHIFT = 8;
+  struct Hash {
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      int pre_hash = place.which() + (1 << LEFT_SHIFT);
+      if (platform::is_gpu_place(place)) {
+        pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
+      }
+      return hash_(pre_hash);
+    }
+  };
+  std::unordered_map<const platform::Place, const platform::DeviceContext*,
+                     Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cu
similarity index 55%
rename from paddle/platform/device_context_test.cc
rename to paddle/platform/device_context_test.cu
index be3b2af5af09cb18f5156412ff60a7fc15a16487..ca10cf34639376798bf5ba05970c9c734e5a1ef8 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cu
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+
+#include "glog/logging.h"
 
 TEST(Device, Init) {
   using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
 
   int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
+    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
     delete device_context;
@@ -31,11 +33,11 @@ TEST(Device, Init) {
 
 TEST(Device, CUDADeviceContext) {
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
 
   int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
+    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
     cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
@@ -47,14 +49,13 @@ TEST(Device, CUDADeviceContext) {
   }
 }
 
-TEST(Device, CudnnDeviceContext) {
-  using paddle::platform::CudnnDeviceContext;
-  using paddle::platform::CudnnPlace;
+TEST(Device, CUDNNDeviceContext) {
+  using paddle::platform::CUDNNDeviceContext;
+  using paddle::platform::CUDAPlace;
   if (paddle::platform::dynload::HasCUDNN()) {
     int count = paddle::platform::GetCUDADeviceCount();
     for (int i = 0; i < count; ++i) {
-      CudnnDeviceContext* device_context =
-          new CudnnDeviceContext(CudnnPlace(i));
+      CUDNNDeviceContext* device_context = new CUDNNDeviceContext(CUDAPlace(i));
       cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
       ASSERT_NE(nullptr, cudnn_handle);
       ASSERT_NE(nullptr, device_context->stream());
@@ -62,3 +63,39 @@ TEST(Device, CudnnDeviceContext) {
     }
   }
 }
+
+TEST(Device, DeviceContextPool) {
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::Place;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::CUDAPlace;
+
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
+
+  std::vector<Place> gpu_places;
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    auto dev_ctx = pool.Get(CUDAPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
+  }
+}
+
+int main(int argc, char** argv) {
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Init(places);
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc
index 8f92b8d94d56047b7d3fb43b15e3c06575c8d57b..4cec829a8ad8994d4a7643613331881e3a397b9a 100644
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/platform/dynload/nccl.h"
 
@@ -25,6 +25,11 @@ void *nccl_dso_handle;
 
 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
+void LoadNCCLDSO() {
+  platform::call_once(nccl_dso_flag,
+                      [] { GetNCCLDsoHandle(&nccl_dso_handle); });
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index 981b2ab258a34ce92f02ee12b5957f88ba61d1c0..6c776afc97a53c964f0bc2b2a8abf2c29f474d3f 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -28,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                         \
-  struct DynLoad__##__name {                                           \
-    template <typename... Args>                                        \
-    auto operator()(Args... args) -> decltype(__name(args...)) {       \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);        \
-      platform::call_once(nccl_dso_flag,                               \
-                          paddle::platform::dynload::GetNCCLDsoHandle, \
-                          &nccl_dso_handle);                           \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);              \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);         \
-    }                                                                  \
-  };                                                                   \
+extern void LoadNCCLDSO();
+
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);  \
+      paddle::platform::dynload::LoadNCCLDSO();                  \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);        \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
+    }                                                            \
+  };                                                             \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
@@ -63,6 +63,8 @@ extern void* nccl_dso_handle;
   __macro(ncclAllReduce);               \
   __macro(ncclBcast);                   \
   __macro(ncclAllGather);               \
+  __macro(ncclGroupStart);              \
+  __macro(ncclGroupEnd);                \
   __macro(ncclReduce);                  \
   __macro(ncclGetErrorString);
 
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 5abd4d4a345ed2750231841325f2b19a2ee8c4c9..d1c7be0790b5e11d6273efe6c08cdb7bf22425c6 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 
+#include "paddle/platform/macros.h"
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"
 
diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..5427aa28238d6b46eb72d1fb49303dce3d871d7d
--- /dev/null
+++ b/paddle/platform/for_range.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename DeviceContext>
+struct ForRange {
+  ForRange(const DeviceContext& dev_ctx, size_t limit);
+
+  template <typename Function>
+  void operator()(Function func) const;
+};
+
+template <>
+struct ForRange<CPUDeviceContext> {
+  ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    for (size_t i = 0; i < limit_; ++i) {
+      func(i);
+    }
+  }
+
+  size_t limit_;
+};
+
+#ifdef __NVCC__
+template <typename Function>
+__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
+  size_t idx = static_cast<size_t>(threadIdx.x);
+  func(idx);
+}
+
+template <typename Function>
+__global__ static void ForRangeElemwiseOp(Function func, int limit) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx < limit) {
+    func(idx);
+  }
+}
+
+template <>
+struct ForRange<CUDADeviceContext> {
+  ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+    constexpr size_t num_threads = 1024;
+    int block_size = limit_ <= num_threads ? limit_ : num_threads;
+    int grid_size = (limit_ + num_threads - 1) / num_threads;
+
+    if (grid_size == 1) {
+      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+          func);
+    } else {
+      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
+          func, limit_);
+    }
+  }
+
+  const CUDADeviceContext& dev_ctx_;
+  int limit_;
+};
+
+#endif
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 4fa2eaed31c6e9368459c2da6f8b0667b453d58c..7037551d7544d6fea54e2f4bf887309b7dc5a52e 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -73,19 +73,20 @@ size_t GpuMaxChunkSize() {
   size_t available = 0;
 
   GpuMemoryUsage(available, total);
-
-  // Reserving the rest memory for page tables, etc.
-  size_t reserving = 0.05 * total;
-
+  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
+           << total / 1024 / 1024 << "M";
+  size_t reserving = static_cast<size_t>(0.05 * total);
   // If available less than minimum chunk size, no usable memory exists.
   available =
-      std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               reserving) -
-      reserving;
+      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
+               total - reserving);
+
+  // Reserving the rest memory for page tables, etc.
 
-  size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;
+  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
+                                          (total - reserving));
 
-  PADDLE_ENFORCE_LT(allocating, available);
+  PADDLE_ENFORCE_LE(allocating, available);
 
   return allocating;
 }
@@ -96,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                  "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
 }
 
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind) {
-  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
-  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
-  //       block on stream 0 to make sure the copy has completed
-  PADDLE_ENFORCE(
-      cudaStreamSynchronize(0),
-      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
-}
-
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index db961f3838af73855312d4cf6a80e2355306e08f..d05131fa4196057d19a8ae57bf4574c666e409cf 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);
 
-//! Copy memory from address src to dst synchronously.
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind);
-
 //! Copy memory from one device to another device.
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream);
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index c99dae68bef67c58d3efea42fef45e84bb3d9255..ef6d845874745af1150e4425f8d6be416cc44ece 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -1,28 +1,30 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/device_vector.h>
+#include <memory>
+#include <vector>
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/dynload/nccl.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
 
-#include <thrust/device_vector.h>
-#include <memory>
-#include <vector>
-
 static int dev_count = 0;
 
 namespace paddle {
@@ -32,6 +34,7 @@ TEST(NCCL, init) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
   PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+
   for (int i = 0; i < dev_count; ++i) {
     dynload::ncclCommDestroy(comms[i]);
   }
@@ -47,7 +50,7 @@ struct PerThreadData {
 
   T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
 
-  PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) {
+  PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
     send_buff.resize(size);
     for (size_t i = 0; i < size; ++i) {
       send_buff[i] = static_cast<T>(i);
@@ -62,7 +65,7 @@ TEST(NCCL, all_reduce) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
   VLOG(1) << "Initializing ncclComm";
-  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
   VLOG(1) << "ncclComm initialized";
   VLOG(1) << "Creating thread data";
   std::vector<std::unique_ptr<PerThreadData<double>>> data;
@@ -131,6 +134,18 @@ int main(int argc, char** argv) {
         << dev_count;
     return 0;
   }
+
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Init(places);
+
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index 25fe8d21b49b07a6afe2938245906dc1bdd90398..249527e3e136992970033c44ad490a1744bfed35 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/platform/place.h"
 
@@ -23,8 +23,9 @@ class PlacePrinter : public boost::static_visitor<> {
  public:
   explicit PlacePrinter(std::ostream &os) : os_(os) {}
   void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
-  void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; }
-  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }
+  void operator()(const CUDAPlace &p) {
+    os_ << "CUDAPlace(" << p.device << ")";
+  }
 
  private:
   std::ostream &os_;
@@ -37,20 +38,14 @@ static Place the_default_place;
 void set_place(const Place &place) { the_default_place = place; }
 const Place &get_place() { return the_default_place; }
 
-const GPUPlace default_gpu() { return GPUPlace(0); }
+const CUDAPlace default_gpu() { return CUDAPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
-const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); }
 
 bool is_gpu_place(const Place &p) {
-  return boost::apply_visitor(IsGPUPlace(), p);
-}
-bool is_cpu_place(const Place &p) {
-  return !is_gpu_place(p) && !is_mkldnn_place(p);
+  return boost::apply_visitor(IsCUDAPlace(), p);
 }
 
-bool is_mkldnn_place(const Place &p) {
-  return boost::apply_visitor(IsMKLDNNPlace(), p);
-}
+bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 4526945792b2ea96cc4e9df11d8f35897cba7526..d25eaa689f4a4baa951db5c61bbf99288e365ee1 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -31,65 +31,35 @@ struct CPUPlace {
   inline bool operator!=(const CPUPlace &) const { return false; }
 };
 
-struct MKLDNNPlace {
-  MKLDNNPlace() {}
-
-  // needed for variant equality comparison
-  inline bool operator==(const MKLDNNPlace &) const { return true; }
-  inline bool operator!=(const MKLDNNPlace &) const { return false; }
-};
-
-struct GPUPlace {
-  GPUPlace() : GPUPlace(0) {}
-  explicit GPUPlace(int d) : device(d) {}
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
 
   inline int GetDeviceId() const { return device; }
   // needed for variant equality comparison
-  inline bool operator==(const GPUPlace &o) const { return device == o.device; }
-  inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
 
   int device;
 };
 
-struct CudnnPlace : public GPUPlace {
-  CudnnPlace() : GPUPlace() {}
-  explicit CudnnPlace(int d) : GPUPlace(d) {}
-};
-
-struct IsGPUPlace : public boost::static_visitor<bool> {
+struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const MKLDNNPlace &) const { return false; }
-  bool operator()(const GPUPlace &gpu) const { return true; }
+  bool operator()(const CUDAPlace &gpu) const { return true; }
 };
 
-struct IsMKLDNNPlace : public boost::static_visitor<bool> {
-  bool operator()(const MKLDNNPlace &) const { return true; }
-  bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const GPUPlace &) const { return false; }
-};
-
-// Define the max number of Place in bit length. i.e., the max number of places
-// should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
-#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
-
-typedef boost::variant<CudnnPlace, GPUPlace, CPUPlace, MKLDNNPlace> Place;
-
-// static check number of place types is less equal than
-// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
-BOOST_MPL_ASSERT((boost::mpl::less_equal<
-                  Place::types::size,
-                  boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>));
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
 
 void set_place(const Place &);
 const Place &get_place();
 
-const GPUPlace default_gpu();
+const CUDAPlace default_gpu();
 const CPUPlace default_cpu();
-const MKLDNNPlace default_mkldnn();
 
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
-bool is_mkldnn_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 
 std::ostream &operator<<(std::ostream &, const Place &);
diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc
index 184af12c230f1ccd7826e507f16f4e91ca380a45..4f1eba01df5531529ad3c79648b5e7f8651df619 100644
--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
@@ -4,7 +4,7 @@
 
 TEST(Place, Equality) {
   paddle::platform::CPUPlace cpu;
-  paddle::platform::GPUPlace g0(0), g1(1), gg0(0);
+  paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
 
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
@@ -21,22 +21,17 @@ TEST(Place, Default) {
   EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
   EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
-  EXPECT_TRUE(
-      paddle::platform::is_mkldnn_place(paddle::platform::default_mkldnn()));
 
+  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
   paddle::platform::set_place(paddle::platform::CPUPlace());
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
-
-  paddle::platform::set_place(paddle::platform::MKLDNNPlace());
-  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
-  EXPECT_TRUE(paddle::platform::is_mkldnn_place(paddle::platform::get_place()));
 }
 
 TEST(Place, Print) {
   {
     std::stringstream ss;
-    ss << paddle::platform::GPUPlace(1);
-    EXPECT_EQ("GPUPlace(1)", ss.str());
+    ss << paddle::platform::CUDAPlace(1);
+    EXPECT_EQ("CUDAPlace(1)", ss.str());
   }
   {
     std::stringstream ss;
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index 148ebaed3d893cd03df8cf27b1309d07afd9aa4a..a88902b164c7a705cae09662e724b35cd8c8b4bf 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
index d36eac8379ebedb284b36012a46186cd3ac43b91..af9204a0a7b6e09fcfdacb8ba985e269665b4034 100644
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "paddle/memory/memcpy.h"
@@ -49,15 +49,15 @@ TEST(Transform, CPUUnary) {
 TEST(Transform, GPUUnary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
-  GPUPlace gpu0(0);
+  CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
-  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
   Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
-  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
@@ -80,14 +80,14 @@ TEST(Transform, GPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  GPUPlace gpu0(0);
+  CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
-  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
   Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
-  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index 619897ca19eb2e6f4dbfd9160edf8c4bc58c89a9..ea6ef8fddf5171731fa1fd323983bf7d90ecdd9a 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -1,19 +1,32 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
+#ifdef __CUDACC__
+#ifdef __CUDACC_VER_MAJOR__
+// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
+// __CUDACC_VER__ instead.
+#undef __CUDACC_VER__
+
+#define __CUDACC_VER__                                         \
+  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
+   __CUDACC_VER_BUILD__)
+#endif
+
+#endif
+
 #include <boost/config.hpp>
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index fd55f410d3f0fee418e7efffa927e46c38d23a07..6afed7eec7001b646d55cef0bc3f59782b80b15f 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
-    SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune
+    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
+    DEPS pybind python backward proto_desc paddle_memory executor prune init
     ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
 
diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc
new file mode 100644
index 0000000000000000000000000000000000000000..761635aa5e5eac445c2ec8331b0dc37ffd11248c
--- /dev/null
+++ b/paddle/pybind/const_value.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "const_value.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindConstValue(pybind11::module& m) {
+  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m.def("kTempVarName", [] { return framework::kTempVarName; });
+  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+
+  // for kernel_hint key
+  m.def("kUseCPU", [] { return framework::kUseCPU; });
+  m.def("kUseCUDNN", [] { return framework::kUseCUDNN; });
+  m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; });
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/const_value.h b/paddle/pybind/const_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d57c972a9d5339c0e155fa1d6395af9face8744
--- /dev/null
+++ b/paddle/pybind/const_value.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <Python.h>
+#include "paddle/platform/enforce.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+extern void BindConstValue(pybind11::module& m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/exception.cc b/paddle/pybind/exception.cc
index ff79b12ee4b28c53ee04f4c170b5bca9ca28d14a..e29ac3ebab760a011a3798f8e4be46270d6b80cc 100644
--- a/paddle/pybind/exception.cc
+++ b/paddle/pybind/exception.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/pybind/exception.h"
 
diff --git a/paddle/pybind/exception.h b/paddle/pybind/exception.h
index 70beac146046f74e23f747bab130483901a7d443..436ddd5707ace37a5668c8d4401c1bdcf2dadfe3 100644
--- a/paddle/pybind/exception.h
+++ b/paddle/pybind/exception.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <Python.h>
diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc
index 24f2a9383f7a069f1a8c7ed2bf3da46720470efa..f4f281229e611a6c9c8e9ecd54e0097ab683bbf3 100644
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
@@ -31,31 +31,32 @@ std::string Escape(const std::string& s) {
   return r;
 }
 
-std::string AttrType(paddle::framework::AttrType at) {
+std::string AttrType(paddle::framework::proto::AttrType at) {
   switch (at) {
-    case paddle::framework::INT:
+    case paddle::framework::proto::INT:
       return "int";
-    case paddle::framework::FLOAT:
+    case paddle::framework::proto::FLOAT:
       return "float";
-    case paddle::framework::STRING:
+    case paddle::framework::proto::STRING:
       return "string";
-    case paddle::framework::BOOLEAN:
+    case paddle::framework::proto::BOOLEAN:
       return "bool";
-    case paddle::framework::INTS:
+    case paddle::framework::proto::INTS:
       return "int array";
-    case paddle::framework::FLOATS:
+    case paddle::framework::proto::FLOATS:
       return "float array";
-    case paddle::framework::STRINGS:
+    case paddle::framework::proto::STRINGS:
       return "string array";
-    case paddle::framework::BOOLEANS:
+    case paddle::framework::proto::BOOLEANS:
       return "bool array";
-    case paddle::framework::BLOCK:
+    case paddle::framework::proto::BLOCK:
       return "block id";
   }
   return "UNKNOWN";  // not possible
 }
 
-void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
+void PrintVar(const paddle::framework::proto::OpProto::Var& v,
+              std::stringstream& ss) {
   ss << " { "
      << "\n"
      << "   \"name\" : \"" << Escape(v.name()) << "\",\n"
@@ -65,7 +66,7 @@ void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
      << " },";
 }
 
-void PrintAttr(const paddle::framework::OpProto::Attr& a,
+void PrintAttr(const paddle::framework::proto::OpProto::Attr& a,
                std::stringstream& ss) {
   ss << " { "
      << "\n"
@@ -81,7 +82,7 @@ void PrintOpProto(const std::string& type,
                   std::stringstream& ss) {
   std::cerr << "Processing " << type << "\n";
 
-  const paddle::framework::OpProto* p = opinfo.proto_;
+  const paddle::framework::proto::OpProto* p = opinfo.proto_;
   if (p == nullptr) {
     return;  // It is possible that an operator doesn't have OpProto.
   }
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 6c8f06cccb92fa9cd22fdb89a9d410e6853895cc..07292d47e9c165c67fe4a30ee7d851c350beb2e0 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) {
 
 // Bind Methods
 void BindProgramDesc(py::module &m) {
-  py::class_<ProgramDescBind>(m, "ProgramDesc", "")
+  py::class_<ProgramDesc>(m, "ProgramDesc", "")
       .def(py::init<>())
       .def("__init__",
-           [](ProgramDescBind &self, const ProgramDescBind &other) {
-             new (&self) ProgramDescBind(other);
+           [](ProgramDesc &self, const ProgramDesc &other) {
+             new (&self) ProgramDesc(other);
            })
       .def("__init__",
-           [](ProgramDescBind &self, const py::bytes &binary_str) {
+           [](ProgramDesc &self, const py::bytes &binary_str) {
              std::string str(binary_str);
-             new (&self) ProgramDescBind(str);
+             new (&self) ProgramDesc(str);
            })
-      .def("append_block", &ProgramDescBind::AppendBlock,
+      .def("append_block", &ProgramDesc::AppendBlock,
            py::return_value_policy::reference)
       .def("append_backward",
-           [](ProgramDescBind &program_desc, const VarDescBind &target,
+           [](ProgramDesc &program_desc, const VarDesc &target,
               const std::unordered_set<std::string> &no_grad_vars) {
              ParamGradInfoMap param_grad_map =
                  AppendBackward(program_desc, target, no_grad_vars);
@@ -138,13 +138,13 @@ void BindProgramDesc(py::module &m) {
              }
              return retv;
            })
-      .def("block", &ProgramDescBind::MutableBlock,
+      .def("block", &ProgramDesc::MutableBlock,
            py::return_value_policy::reference)
-      .def("num_blocks", &ProgramDescBind::Size)
-      .def("serialize_to_string", SerializeMessage<ProgramDescBind>)
+      .def("num_blocks", &ProgramDesc::Size)
+      .def("serialize_to_string", SerializeMessage<ProgramDesc>)
       .def("parse_from_string",
-           [](ProgramDescBind &program_desc, const std::string &data) {
-             ProgramDesc *desc = program_desc.Proto();
+           [](ProgramDesc &program_desc, const std::string &data) {
+             proto::ProgramDesc *desc = program_desc.Proto();
              PADDLE_ENFORCE(desc->ParseFromString(data),
                             "Fail to parse ProgramDesc from string. This could "
                             "be a bug of Paddle.");
@@ -152,109 +152,134 @@ void BindProgramDesc(py::module &m) {
 }
 
 void BindBlockDesc(py::module &m) {
-  py::class_<BlockDescBind>(m, "BlockDesc", "")
-      .def_property_readonly("id", &BlockDescBind::ID)
-      .def_property_readonly("parent", &BlockDescBind::Parent)
-      .def("append_op", &BlockDescBind::AppendOp,
+  py::class_<BlockDesc>(m, "BlockDesc", "")
+      .def_property_readonly("id", &BlockDesc::ID)
+      .def_property_readonly("parent", &BlockDesc::Parent)
+      .def("append_op", &BlockDesc::AppendOp,
            py::return_value_policy::reference)
-      .def("prepend_op", &BlockDescBind::PrependOp,
+      .def("prepend_op", &BlockDesc::PrependOp,
            py::return_value_policy::reference)
+      .def("remove_op", &BlockDesc::RemoveOp)
       .def("var",
-           [](BlockDescBind &self, py::bytes byte_name) {
+           [](BlockDesc &self, py::bytes byte_name) {
              std::string name = byte_name;
              return self.Var(name);
            },
            py::return_value_policy::reference)
       .def("has_var",
-           [](BlockDescBind &self, py::bytes byte_name) {
+           [](BlockDesc &self, py::bytes byte_name) {
              std::string name = byte_name;
              return self.HasVar(name);
            })
+      .def("has_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.HasVarRecursive(name);
+           })
       .def("find_var",
-           [](BlockDescBind &self, py::bytes byte_name) {
+           [](BlockDesc &self, py::bytes byte_name) {
              std::string name = byte_name;
              return self.FindVar(name);
            },
            py::return_value_policy::reference)
-      .def("all_vars", &BlockDescBind::AllVars,
+      .def("find_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.FindVarRecursive(name);
+           },
            py::return_value_policy::reference)
-      .def("op_size", &BlockDescBind::OpSize)
-      .def("op", &BlockDescBind::Op, py::return_value_policy::reference)
-      .def("serialize_to_string", SerializeMessage<BlockDescBind>);
+      .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
+      .def("op_size", &BlockDesc::OpSize)
+      .def("op", &BlockDesc::Op, py::return_value_policy::reference)
+      .def("serialize_to_string", SerializeMessage<BlockDesc>);
 }
 
 void BindVarDsec(py::module &m) {
-  py::enum_<DataType>(m, "DataType", "")
-      .value("BOOL", DataType::BOOL)
-      .value("INT16", DataType::INT16)
-      .value("INT32", DataType::INT32)
-      .value("INT64", DataType::INT64)
-      .value("FP16", DataType::FP16)
-      .value("FP32", DataType::FP32)
-      .value("FP64", DataType::FP64);
+  py::enum_<proto::DataType>(m, "DataType", "")
+      .value("BOOL", proto::DataType::BOOL)
+      .value("INT16", proto::DataType::INT16)
+      .value("INT32", proto::DataType::INT32)
+      .value("INT64", proto::DataType::INT64)
+      .value("FP16", proto::DataType::FP16)
+      .value("FP32", proto::DataType::FP32)
+      .value("FP64", proto::DataType::FP64);
 
-  py::class_<VarDescBind> var_desc(m, "VarDesc", "");
+  py::class_<VarDesc> var_desc(m, "VarDesc", "");
   var_desc
       .def("name",
-           [](const VarDescBind &self) {
+           [](const VarDesc &self) {
              py::bytes name = self.Name();
              return name;
            },
            py::return_value_policy::reference)
-      .def("set_shape", &VarDescBind::SetShape)
-      .def("set_dtype", &VarDescBind::SetDataType)
-      .def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
-      .def("dtype", &VarDescBind::GetDataType)
-      .def("lod_level", &VarDescBind::GetLodLevel)
-      .def("set_lod_level", &VarDescBind::SetLoDLevel)
-      .def("type", &VarDescBind::GetType)
-      .def("set_type", &VarDescBind::SetType)
-      .def("serialize_to_string", SerializeMessage<VarDescBind>)
-      .def("persistable", &VarDescBind::Persistable)
-      .def("set_persistable", &VarDescBind::SetPersistable);
+      .def("set_shape", &VarDesc::SetShape)
+      .def("set_dtype", &VarDesc::SetDataType)
+      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
+      .def("lod_level", &VarDesc::GetLodLevel)
+      .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("type", &VarDesc::GetType)
+      .def("set_type", &VarDesc::SetType)
+      .def("serialize_to_string", SerializeMessage<VarDesc>)
+      .def("persistable", &VarDesc::Persistable)
+      .def("set_persistable", &VarDesc::SetPersistable);
 
-  py::enum_<VarDesc::VarType>(var_desc, "VarType", "")
-      .value("LOD_TENSOR", VarDesc::LOD_TENSOR)
-      .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS)
-      .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
-      .value("FETCH_LIST", VarDesc::FETCH_LIST)
-      .value("STEP_SCOPES", VarDesc::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE)
-      .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY);
+  py::enum_<proto::VarDesc::VarType>(var_desc, "VarType", "")
+      .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR)
+      .value("SELECTED_ROWS", proto::VarDesc::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", proto::VarDesc::FEED_MINIBATCH)
+      .value("FETCH_LIST", proto::VarDesc::FETCH_LIST)
+      .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
+      .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY);
 }
 
 void BindOpDesc(py::module &m) {
-  py::enum_<AttrType>(m, "AttrType", "")
-      .value("INT", AttrType::INT)
-      .value("INTS", AttrType::INTS)
-      .value("FLOAT", AttrType::FLOAT)
-      .value("FLOATS", AttrType::FLOATS)
-      .value("STRING", AttrType::STRING)
-      .value("STRINGS", AttrType::STRINGS)
-      .value("BOOL", AttrType::BOOLEAN)
-      .value("BOOLS", AttrType::BOOLEANS)
-      .value("BLOCK", AttrType::BLOCK);
+  py::enum_<proto::AttrType>(m, "AttrType", "")
+      .value("INT", proto::AttrType::INT)
+      .value("INTS", proto::AttrType::INTS)
+      .value("FLOAT", proto::AttrType::FLOAT)
+      .value("FLOATS", proto::AttrType::FLOATS)
+      .value("STRING", proto::AttrType::STRING)
+      .value("STRINGS", proto::AttrType::STRINGS)
+      .value("BOOL", proto::AttrType::BOOLEAN)
+      .value("BOOLS", proto::AttrType::BOOLEANS)
+      .value("BLOCK", proto::AttrType::BLOCK);
 
-  py::class_<OpDescBind> op_desc(m, "OpDesc", "");
-  op_desc.def("type", &OpDescBind::Type)
-      .def("set_type", &OpDescBind::SetType)
-      .def("input", &OpDescBind::Input)
-      .def("input_names", &OpDescBind::InputNames)
-      .def("set_input", &OpDescBind::SetInput)
-      .def("output", &OpDescBind::Output)
-      .def("output_names", &OpDescBind::OutputNames)
-      .def("set_output", &OpDescBind::SetOutput)
-      .def("has_attr", &OpDescBind::HasAttr)
-      .def("attr_type", &OpDescBind::GetAttrType)
-      .def("attr_names", &OpDescBind::AttrNames)
-      .def("set_attr", &OpDescBind::SetAttr)
-      .def("attr", &OpDescBind::GetAttr)
-      .def("set_block_attr", &OpDescBind::SetBlockAttr)
-      .def("block_attr", &OpDescBind::GetBlockAttr)
-      .def("check_attrs", &OpDescBind::CheckAttrs)
-      .def("infer_shape", &OpDescBind::InferShape)
-      .def("infer_var_type", &OpDescBind::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDescBind>);
+  py::class_<OpDesc> op_desc(m, "OpDesc", "");
+  op_desc
+      .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
+           py::return_value_policy::reference)
+      .def("copy_from", &OpDesc::CopyFrom)
+      .def("type", &OpDesc::Type)
+      .def("set_type", &OpDesc::SetType)
+      .def("input", &OpDesc::Input)
+      .def("input_names", &OpDesc::InputNames)
+      .def("output", &OpDesc::Output)
+      .def("output_names", &OpDesc::OutputNames)
+      .def("set_input", &OpDesc::SetInput)
+      .def("set_output", &OpDesc::SetOutput)
+      .def("input_arg_names", &OpDesc::InputArgumentNames)
+      .def("output_arg_names", &OpDesc::OutputArgumentNames)
+      .def("rename_input", &OpDesc::RenameInput)
+      .def("rename_output", &OpDesc::RenameOutput)
+      .def("has_attr", &OpDesc::HasAttr)
+      .def("attr_type", &OpDesc::GetAttrType)
+      .def("attr_names", &OpDesc::AttrNames)
+      .def("set_attr", &OpDesc::SetAttr)
+      .def("attr", &OpDesc::GetAttr)
+      .def("set_block_attr", &OpDesc::SetBlockAttr)
+      .def("set_serialized_attr",
+           [](OpDesc &self, const std::string &name,
+              const py::bytes &seriralized) {
+             std::string ser(seriralized);
+             self.SetAttr(name, ser);
+           })
+      .def("block_attr", &OpDesc::GetBlockAttr)
+      .def("check_attrs", &OpDesc::CheckAttrs)
+      .def("infer_shape", &OpDesc::InferShape)
+      .def("infer_var_type", &OpDesc::InferVarType)
+      .def("serialize_to_string", SerializeMessage<OpDesc>);
 }
 
 }  // namespace pybind
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 1faf24bcb8828596ec37abde9e699f46526e41df..04485ce7c1ab87f8655b0e6cbaecc36b3382f647 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -16,11 +16,11 @@ limitations under the License. */
 
 #include <mutex>  // for call_once
 #include <unordered_map>
-#include "gflags/gflags.h"
 #include "paddle/framework/backward.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/feed_fetch_method.h"
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/pybind/const_value.h"
 #include "paddle/pybind/exception.h"
 #include "paddle/pybind/pybind.h"
 #include "paddle/pybind/tensor_py.h"
@@ -51,24 +52,6 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
   return generators[prefix].fetch_add(1);
 }
 
-std::once_flag gflags_init_flag;
-
-// TODO(qijun) move init gflags to init.cc
-void InitGflags(std::vector<std::string> &argv) {
-  std::call_once(gflags_init_flag, [&]() {
-    int argc = argv.size();
-    char **arr = new char *[argv.size()];
-    std::string line;
-    for (size_t i = 0; i < argv.size(); i++) {
-      arr[i] = &argv[i][0];
-      line += argv[i];
-      line += ' ';
-    }
-    google::ParseCommandLineFlags(&argc, &arr, true);
-    VLOG(1) << "Init commandline: " << line;
-  });
-}
-
 bool IsCompileGPU() {
 #ifndef PADDLE_WITH_CUDA
   return false;
@@ -95,8 +78,12 @@ PYBIND11_PLUGIN(core) {
            [](Tensor &self, const std::vector<int64_t> &dim) {
              self.Resize(make_ddim(dim));
            })
+      .def("set_layout",
+           [](Tensor &self, const std::string &layout) {
+             self.set_layout(StringToDataLayout(layout));
+           })
       .def("alloc_float",
-           [](Tensor &self, paddle::platform::GPUPlace &place) {
+           [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<float>(place);
            })
       .def("alloc_float",
@@ -108,7 +95,7 @@ PYBIND11_PLUGIN(core) {
              self.mutable_data<int>(place);
            })
       .def("alloc_int",
-           [](Tensor &self, paddle::platform::GPUPlace &place) {
+           [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<int>(place);
            })
       .def("set", PyCPUTensorSetFromArray<float>)
@@ -282,38 +269,39 @@ All parameter, weight, gradient are variables in Paddle.
     }
     return ret_values;
   });
-  m.def("get_grad_op_descs",
-        [](const OpDescBind &op_desc,
-           const std::unordered_set<std::string> &no_grad_set,
-           std::unordered_map<std::string, std::string> &grad_to_var,
-           const std::vector<BlockDescBind *> &grad_sub_block) {
-          std::vector<std::unique_ptr<OpDescBind>> grad_op_descs =
-              framework::OpInfoMap::Instance()
-                  .Get(op_desc.Type())
-                  .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
-                                 grad_sub_block);
-          std::vector<OpDescBind *> grad_op_desc_ptrs(grad_op_descs.size());
-          std::transform(
-              grad_op_descs.begin(), grad_op_descs.end(),
-              grad_op_desc_ptrs.begin(),
-              [](std::unique_ptr<OpDescBind> &p) { return p.release(); });
-          return grad_op_desc_ptrs;
-        });
-  m.def("prune", [](const ProgramDescBind &origin,
+  m.def(
+      "get_grad_op_desc", [](const OpDesc &op_desc,
+                             const std::unordered_set<std::string> &no_grad_set,
+                             const std::vector<BlockDesc *> &grad_sub_block) {
+        std::unordered_map<std::string, std::string> grad_to_var;
+        std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
+            framework::OpInfoMap::Instance()
+                .Get(op_desc.Type())
+                .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
+                               grad_sub_block);
+        std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
+        std::transform(grad_op_descs.begin(), grad_op_descs.end(),
+                       grad_op_desc_ptrs.begin(),
+                       [](std::unique_ptr<OpDesc> &p) { return p.release(); });
+        return std::make_pair(grad_op_desc_ptrs, grad_to_var);
+      });
+  m.def("prune", [](const ProgramDesc &origin,
                     const std::vector<std::array<size_t, 2>> &targets) {
-    ProgramDescBind prog_with_targets(origin);
+    ProgramDesc prog_with_targets(origin);
     for (const auto &t : targets) {
       prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
     }
-    ProgramDesc pruned_desc;
+    proto::ProgramDesc pruned_desc;
     Prune(*prog_with_targets.Proto(), &pruned_desc);
-    return new ProgramDescBind(pruned_desc);
+    return new ProgramDesc(pruned_desc);
   });
-  m.def("inference_optimize", [](ProgramDescBind &origin) {
-    ProgramDesc pruned_desc;
+  m.def("inference_optimize", [](ProgramDesc &origin) {
+    proto::ProgramDesc pruned_desc;
     InferenceOptimize(*(origin.Proto()), &pruned_desc);
-    return new ProgramDescBind(pruned_desc);
+    return new ProgramDesc(pruned_desc);
   });
+  m.def("empty_var_name", []() { return framework::kEmptyVarName; });
+  m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
@@ -327,10 +315,10 @@ All parameter, weight, gradient are variables in Paddle.
                     return new paddle::platform::CPUDeviceContext();
                   })
       .def_static("create",
-                  [](paddle::platform::GPUPlace& place)
+                  [](paddle::platform::CUDAPlace& place)
                       -> paddle::platform::DeviceContext* {
 #ifndef PADDLE_WITH_CUDA
-                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+                    PADDLE_THROW("CUDAPlace is not supported in CPU device.");
 #else
                     return new paddle::platform::CUDADeviceContext(place);
 #endif
@@ -340,9 +328,9 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_CUDA
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
-  py::class_<platform::GPUPlace>(m, "GPUPlace")
+  py::class_<platform::CUDAPlace>(m, "CUDAPlace")
       .def(py::init<int>())
-      .def("__str__", string::to_string<const platform::GPUPlace &>);
+      .def("__str__", string::to_string<const platform::CUDAPlace &>);
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
       .def(py::init<>())
@@ -355,14 +343,14 @@ All parameter, weight, gradient are variables in Paddle.
              self = cpu_place;
            })
       .def("set_place",
-           [](platform::Place &self, const platform::GPUPlace &gpu_place) {
+           [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
              self = gpu_place;
            });
 
   py::class_<OperatorBase>(m, "Operator")
       .def_static("create",
                   [](py::bytes protobin) {
-                    OpDesc desc;
+                    proto::OpDesc desc;
                     PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                                    "Cannot parse user input to OpDesc");
                     PADDLE_ENFORCE(desc.IsInitialized(),
@@ -377,10 +365,10 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
-              const platform::DeviceContext &dev_ctx) {
-             self.Run(scope, dev_ctx);
-             dev_ctx.Wait();
-           })
+              const platform::CPUPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::CUDAPlace &place) { self.Run(scope, place); })
       .def("type",
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
@@ -415,7 +403,7 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
       .def_static("create",
                   [](py::bytes protobin) -> operators::CondOp * {
-                    OpDesc desc;
+                    proto::OpDesc desc;
                     PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                                    "Cannot parse user input to OpDesc");
                     PADDLE_ENFORCE(desc.IsInitialized(),
@@ -434,11 +422,12 @@ All parameter, weight, gradient are variables in Paddle.
            });
 
   py::class_<framework::Executor>(m, "Executor")
-      .def(py::init<std::vector<platform::Place> &>())
+      .def(py::init<const platform::Place &>())
       .def("run", &Executor::Run);
 
   m.def("unique_integer", UniqueIntegerGenerator);
-  m.def("init_gflags", InitGflags);
+  m.def("init_gflags", framework::InitGflags);
+  m.def("init_devices", &framework::InitDevices);
 
   m.def("is_compile_gpu", IsCompileGPU);
   m.def("set_feed_variable", framework::SetFeedVariable);
@@ -448,6 +437,7 @@ All parameter, weight, gradient are variables in Paddle.
   BindBlockDesc(m);
   BindVarDsec(m);
   BindOpDesc(m);
+  BindConstValue(m);
 
   py::class_<framework::LoDRankTable>(m, "LodRankTable")
       .def("items", [](framework::LoDRankTable &table) {
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 41fa658502d341fe9653a3e99b58498fcaeada47..4d5e73e2c266b301de4f19e09be7ab4009c936d3 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -1,21 +1,22 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <string>
 #include "paddle/framework/tensor.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
@@ -61,13 +62,17 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
-        // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
-        // a Python numpy array. It's better to manage CDUA stream unifiedly.
-        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
-                                        sizeof(CUR_TYPE) * tensor.numel(),
-                                        cudaMemcpyDeviceToHost);
+
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
+            pool.Get(tensor.place()));
+
+        paddle::platform::GpuMemcpyAsync(
+            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
+            cudaMemcpyDeviceToHost, dev_ctx->stream());
 #else
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+        PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
       } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
@@ -123,7 +128,7 @@ template <typename T>
 void PyCUDATensorSetFromArray(
     framework::Tensor &self,
     py::array_t<T, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::GPUPlace &place) {
+    paddle::platform::CUDAPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray(
 
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
-  // TODO(qijun): Here we use default CUDA stream to set a Python numpy
-  // array to a GPU Tensor. It's better to manage CDUA stream unifiedly.
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }
 #endif
 
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index a52f06fe497dac467e4ef2543ebda7a423ca326d..68cb5a19f99ab5148b04d193eb2356588bdc5a59 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -5,11 +5,3 @@ configure_file(submit_local.sh.in
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
             GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-configure_file(tools/usage_stat/usage.sh
-    paddle_usage
-    @ONLY)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin
-        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 43d2d1b410fa86dc0ab213cba0c2a488770ea1c7..a94bc01b358c508132eb85920a2d4c0aa934dd51 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -165,9 +165,6 @@ case "$1" in
     "make_diagram")
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
-    "usage")
-        $PADDLE_BIN_PATH/paddle_usage ${@:2}
-        ;;
     "version")
         version
         ;;
diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh
deleted file mode 100755
index 7dbd1f58842f50ea1df0e2476c4a493569b1dda9..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/usage_stat/usage.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/bin/bash
-
-ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
-KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
-# paddle config home dir, same as paddle
-PADDLE_CONF_HOME="$HOME/.config/paddle"
-# api url, mirror url(s) will be append later
-PD_URLS="http://api.paddlepaddle.org/version"
-
-usage()
-{
-    echo "Usage: `basename $0` [options]"
-    echo "Options:"
-    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
-    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
-    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
-    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
-    echo "  -v, -i                            Verbose output and interact with user when necessary"
-    echo " --help                             display this help message"
-}
-
-eval set -- "${ARGPARSE}"
-while true; do
-    case "$1" in
-        -l|--log-file)
-            log_file=$2
-            shift 2
-            ;;
-        -e|--exit-code)
-            exit_code=$2
-            shift 2
-            ;;
-        -u|--git-user)
-            github_user=$2
-            shift 2
-            ;;
-        -n|--task-name)
-            task=$2
-            shift 2
-            ;;
-        -v|-i)
-            v=1
-            shift
-            ;;
-        --dry-run)
-            dry_run=1
-            shift
-            ;;
-        --)
-            shift
-            break
-            ;;
-        --help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Invalid option $1"
-            usage
-            exit 1
-            ;;
-    esac
-done
-
-# parse the log_file to get the time costs
-if [ -s "${log_file}" ]; then
-    duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
-    {if(index($2,":")==3){
-        t=substr($2,1,8);
-        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
-        if(sec<last_sec-600){day+=1;sec+=86400;}
-        last_sec=sec;
-        if(min_sec==0 || min_sec>sec){min_sec=sec;}
-        if(max_sec==0 || max_sec<sec){max_sec=sec;}
-    }}
-    END{print max_sec-min_sec}' ${log_file}`
-else
-    duration=-1
-fi
-if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
-
-# try find the user/email if not given
-if [ -z "${github_user}" ]; then
-    # search for cached username
-    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
-        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
-        github_user=`cat ${PADDLE_CONF_HOME}/github_user`
-    else
-        # search the github-user from git config
-        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
-        git_username=`git config --get user.name 2>/dev/null`
-        git_url=`git config --get remote.origin.url 2>/dev/null`
-        if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
-            # under a git url, like https://github.com/user_xxx/proj_yyy.git
-            if [ "${v}" = "1" ]; then echo " from github url..."; fi
-            github_user=`echo ${git_url} | cut -d "/" -f 4`
-            if [ "${github_user}" = "PaddlePaddle" ]; then
-                github_user=
-            fi
-        fi
-        if [ -n "${git_username}" -a -z "${github_user}" ]; then
-            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
-            github_user=${git_username}
-        fi
-    fi
-fi
-# allow user to set the user name, if it's not found
-if [ -z "${github_user}" -a "${v}" = "1" ]; then
-    read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
-    github_user=${REPLY}
-    if [ -z "${github_user}" ]; then
-        # empty input, consider as one anonymous user
-        github_user="${KEEP_ANONYMOUS}"
-    fi
-fi
-if [ -n "${github_user}" -a -z "${dry_run}" ]; then
-    # valid user and not in dry-run mode, then save to cache
-    mkdir -p ${PADDLE_CONF_HOME}
-    echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
-fi
-if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
-if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
-    # anonymous user should keep the var empty.
-    github_user=
-fi
-
-# read local paddle version
-paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
-if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
-
-# read local system time
-system_time=`date "+%Y%m%d%H%M%S"`
-if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
-
-# make empty job_name as default value.
-if [ -z "${task}" ]; then
-    task="(unknown_task)"
-fi
-if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
-
-# concat the curl command
-params="content={\"data_type\":\"usage\",\
-\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
-\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
-\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
-}&type=1"
-curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
- -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
-
-if [ "${dry_run}" = "1" ]; then
-    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
-    echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
-    exit 0
-else
-    for u in ${PD_URLS}; do
-        curl_cmd="${curl_cmd_prefix} ${u}"
-        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
-        ${curl_cmd} >/dev/null 2>&1
-        if [ $? -eq 0 ]; then
-            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
-            exit 0
-        else
-            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
-        fi
-    done
-    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
-    exit 1
-fi
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index ff0bac6a0740111dfa1a1440daaf1ceaf3a7b0d8..0db8d33bbcb5278ed0dd5584b5822502b719ede9 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -14,9 +14,8 @@ make -j `nproc` print_operators_doc
 paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
-# It will be failed now!
-#linkchecker doc/en/html/index.html
-#linkchecker doc/cn/html/index.html
+linkchecker doc/en/html/index.html
+linkchecker doc/cn/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
diff --git a/paddle/string/to_string.h b/paddle/string/to_string.h
index 4f478b6a36b23bdba8ef3ddae94b3eadf18716c2..3b3bcc69a478045156225728236174fd601461dd 100644
--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include <sstream>
diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc
index 971484dd0c073762e99f3926576eb21b96197769..4956bd96fad5fd1decaad0a367135cb7d7ecaf6e 100644
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/string/to_string.h"
 #include <gtest/gtest.h>
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 8132742749e4a622720c66692c8d09815714ebea..77f84cd43bdf35ae6f54b0db2b5f720d24872878 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -6,7 +6,6 @@ if(WITH_TESTING)
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
   if(NOT MOBILE_INFERENCE)
-    add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
-    add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
   endif()
 endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index a491322b7e533f7a9c263a249494440269391003..108ff335bf6b920c648d4bfebbd6a40ffb6fd939 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cstring>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/init.h"
 #include "paddle/memory/memory.h"
 
 int main(int argc, char** argv) {
@@ -32,8 +34,11 @@ int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
   testing::InitGoogleTest(&argc, argv);
   paddle::memory::Used(paddle::platform::CPUPlace());
+  std::vector<std::string> devs = {"CPU"};
 #ifdef PADDLE_WITH_CUDA
-  paddle::memory::Used(paddle::platform::GPUPlace(0));
+  paddle::memory::Used(paddle::platform::CUDAPlace(0));
+  devs.push_back("GPU:0");
 #endif
+  paddle::framework::InitDevices(devs);
   return RUN_ALL_TESTS();
 }
diff --git a/python/.gitignore b/python/.gitignore
index cc7d0ece4acaba2a3fa38a89110587fe8dffb992..1ba1d4c9b0301ed920f5303089e75dd3a8e4e3fa 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -2,6 +2,7 @@
 build
 dist
 paddle.egg-info
+paddlepaddle_gpu.egg-info
 .idea
 paddle/proto/*.py
 paddle/proto/*.pyc
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 7e118b24a4330af27ea6aa893fd87985b4443cdb..19e2ab1b7da7b1ceacd6842f2d74ac551497c77b 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -270,7 +270,7 @@ class LayerType(object):
     @staticmethod
     def is_layer_type(type_name):
         """
-        If type_name is a layer type.
+        Whether type_name is a layer type.
 
         :param type_name: layer type name. Because layer type enumerations are
                           strings.
@@ -441,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None):
        with mixed_layer(size=100) as m:
            m += full_matrix_projection(input=layer)
 
-    2. When used as an independant object like this, you must set the size:
+    2. When used as an independent object like this, you must set the size:
 
     .. code-block:: python
 
@@ -451,11 +451,11 @@ def full_matrix_projection(input, size=0, param_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
+    :param size: The dimension of this layer.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A FullMatrixProjection Object.
+    :return: FullMatrixProjection Object.
     :rtype: FullMatrixProjection
     """
     proj = FullMatrixProjection(
@@ -468,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None):
 def trans_full_matrix_projection(input, size=0, param_attr=None):
     """
     Different from full_matrix_projection, this projection performs matrix
-    multiplication, using transpose of weight.
+    multiplication, using the transpose of weight.
 
     ..  math::
         out.row[i] += in.row[i] * w^\mathrm{T}
 
-    :math:`w^\mathrm{T}` means transpose of weight.
+    :math:`w^\mathrm{T}` means the transpose of weight.
     The simply usage is:
 
     .. code-block:: python
@@ -489,9 +489,9 @@ def trans_full_matrix_projection(input, size=0, param_attr=None):
     :type input: LayerOutput
     :param size: The parameter size. Means the width of parameter.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A TransposedFullMatrixProjection Object.
+    :return: TransposedFullMatrixProjection Object.
     :rtype: TransposedFullMatrixProjection
     """
     proj = TransposedFullMatrixProjection(
@@ -521,7 +521,7 @@ def table_projection(input, size=0, param_attr=None):
        with mixed_layer(size=100) as m:
            m += table_projection(input=layer)
 
-    2. When used as an independant object like this, you must set the size:
+    2. When used as an independent object like this, you must set the size:
 
     .. code-block:: python
 
@@ -532,11 +532,11 @@ def table_projection(input, size=0, param_attr=None):
 
     :param input: The input of this layer, which must contains id fields.
     :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
+    :param size: The dimension of the output.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A TableProjection Object.
+    :return: TableProjection Object.
     :rtype: TableProjection
     """
     proj = TableProjection(
@@ -547,7 +547,7 @@ def table_projection(input, size=0, param_attr=None):
 
 def identity_projection(input, offset=None, size=None):
     """
-    1. IdentityProjection if offset=None. It performs:
+    1. If offset=None, it performs IdentityProjection as follows:
 
     .. math::
        out.row[i] += in.row[i]
@@ -559,9 +559,8 @@ def identity_projection(input, offset=None, size=None):
        proj = identity_projection(input=layer)
 
 
-    2. IdentityOffsetProjection if offset!=None. It likes IdentityProjection,
-    but layer size may be smaller than input size.
-    It select dimesions [offset, offset+layer_size) from input:
+    2. If offset!=None, It executes IdentityOffsetProjection and takes the
+       elements of the input in the range [offset, offset+size) as output.
 
     .. math::
        out.row[i] += in.row[i + \\textrm{offset}]
@@ -573,14 +572,20 @@ def identity_projection(input, offset=None, size=None):
        proj = identity_projection(input=layer,
                                   offset=10)
 
-    Note that both of two projections should not have any parameter.
+    Note that neither of the projections have trainable parameter.
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param offset: Offset, None if use default.
+    :param offset: The offset from the start of the input. The input's
+                   elements in the range [offset, offset+size) will be
+                   taken as output. If this parameter is not set or set
+                   to None, the output will be the same as the input.
     :type offset: int
-    :return: A IdentityProjection or IdentityOffsetProjection object
-    :rtype: IdentityProjection or IdentityOffsetProjection
+    :param size: The dimension of this layer. It will be neglected
+                 when offset is None or not set.
+    :type size: int
+    :return: IdentityProjection or IdentityOffsetProjection object
+    :rtype: IdentityProjection | IdentityOffsetProjection
     """
     if offset is None:
         proj = IdentityProjection(input_layer_name=input.name)
@@ -596,8 +601,8 @@ def identity_projection(input, offset=None, size=None):
 
 def slice_projection(input, slices):
     """
-    slice_projection can slice the input value into multiple parts,
-    and then select some of them to merge into a new output.
+    slice_projection slices the input value into multiple parts,
+    then selects and merges some of them into a new output.
 
     .. math::
        output = [input.slices()]
@@ -608,15 +613,13 @@ def slice_projection(input, slices):
 
        proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
 
-    Note that slice_projection should not have any parameter.
+    Note that slice_projection has no trainable parameter.
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param slices: An array of slice parameters.
-                   Each slice contains the start and end offsets based
-                   on the input.
-    :type slices: pair of int
-    :return: A SliceProjection object
+    :param slices: A list of start and end offsets of each slice.
+    :type slices: list of tuple
+    :return: SliceProjection object.
     :rtype: SliceProjection
     """
     assert len(slices) >= 1
@@ -636,8 +639,7 @@ def slice_projection(input, slices):
 @wrap_param_attr_default()
 def scaling_projection(input, param_attr=None):
     """
-    scaling_projection multiplies the input with a scalar parameter and add to
-    the output.
+    scaling_projection multiplies the input with a scalar parameter.
 
     .. math::
        out += w * in
@@ -650,9 +652,9 @@ def scaling_projection(input, param_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A ScalingProjection object
+    :return: ScalingProjection object.
     :rtype: ScalingProjection
     """
     proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
@@ -663,8 +665,8 @@ def scaling_projection(input, param_attr=None):
 @wrap_param_attr_default()
 def dotmul_projection(input, param_attr=None):
     """
-    DotMulProjection with a layer as input.
-    It performs element-wise multiplication with weight.
+    DotMulProjection takes a layer as input and performs
+    element-wise multiplication with weight.
 
     ..  math::
         out.row[i] += in.row[i] .* weight
@@ -679,9 +681,9 @@ def dotmul_projection(input, param_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A DotMulProjection Object.
+    :return: DotMulProjection object.
     :rtype: DotMulProjection
     """
     proj = DotMulProjection(
@@ -698,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
        out.row[i] += scale * (a.row[i] .* b.row[i])
 
     where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is one.
+    scale is a config scalar, its default value is 1.
 
     The example usage is:
 
@@ -706,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
 
        op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
 
-    :param a: Input layer1
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: Input layer2
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: config scalar, default value is one.
+    :param scale: A scalar to scale the product. Its default value is 1.
     :type scale: float
-    :return: A DotMulOperator Object.
+    :return: DotMulOperator object.
     :rtype: DotMulOperator
     """
     if 'x' in kwargs or 'y' in kwargs:
@@ -738,28 +740,29 @@ def context_projection(input,
     """
     Context Projection.
 
-    It just simply reorganizes input sequence, combines "context_len" sequence
-    to one context from context_start. "context_start" will be set to
-    -(context_len - 1) / 2 by default. If context position out of sequence
+    It just reorganizes input sequence, combines "context_len" elements of the
+    sequence to one context from context_start. "context_start" will be set to
+    -(context_len - 1) / 2 by default. When context position is out of sequence
     length, padding will be filled as zero if padding_attr = False, otherwise
     it is trainable.
 
-    For example, origin sequence is [A B C D E F G], context len is 3, then
-    after context projection and not set padding_attr, sequence will
+    For example, origin sequence is [A B C D E F G], context len is 3, padding_attr
+    is not set, then after context projection, sequence will
     be [ 0AB ABC BCD CDE DEF EFG FG0 ].
 
     :param input: The input of this layer, which should be a sequence.
     :type input: LayerOutput
-    :param context_len: context length.
+    :param context_len: The length of the context.
     :type context_len: int
-    :param context_start: context start position. Default is
+    :param context_start: The start position of the context. The default value is
                           -(context_len - 1)/2
     :type context_start: int
-    :param padding_attr: Padding Parameter Attribute. If false, it means padding
-                         always be zero. Otherwise Padding is learnable, and
-                         parameter attribute is set by this parameter.
+    :param padding_attr: Parameter attribute of the padding. If the parameter is
+                         set to False, padding will be zero. In other cases, the
+                         padding is trainable, and its parameter attribute is set
+                         by this parameter.
     :type padding_attr: bool | ParameterAttribute
-    :return: Projection
+    :return: Projection object.
     :rtype: Projection
     """
     context_start = -(
@@ -791,10 +794,9 @@ class MixedLayerType(LayerOutput):
 
     def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
         """
-        Ctor.
-        :param name: layer name.
+        :param name: The name of this layer.
         :type name: basestring
-        :param size: layer size.
+        :param size: The dimension of this layer.
         :type size: int
         :param act: Activation type.
         :type act: BaseActivation
@@ -802,8 +804,9 @@ class MixedLayerType(LayerOutput):
                           whose type is not ParameterAttribute, no bias is defined. If the
                           parameter is set to True, the bias is initialized to zero.
         :type bias_attr: ParameterAttribute | None | bool | Any
-        :param layer_attr: Extra Layer Attribute.
-        :type layer_attr: ExtraLayerAttribute or None
+        :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                           details.
+        :type layer_attr: ExtraLayerAttribute | None
         """
         LayerOutput.__init__(
             self,
@@ -868,12 +871,12 @@ def mixed_layer(size=0,
                 bias_attr=False,
                 layer_attr=None):
     """
-    Mixed Layer. A mixed layer will add all inputs together, then activate.
-    Each inputs is a projection or operator.
+    Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
+    Each input is a projection or operator.
 
     There are two styles of usages.
 
-    1. When not set inputs parameter, use mixed_layer like this:
+    1. When the parameter input is not set, use mixed_layer like this:
 
     .. code-block:: python
 
@@ -889,21 +892,21 @@ def mixed_layer(size=0,
                        input=[full_matrix_projection(input=layer1),
                               full_matrix_projection(input=layer2)])
 
-    :param name: mixed layer name. Can be referenced by other layer.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param size: layer size.
+    :param size: The dimension of this layer.
     :type size: int
-    :param input: The input of this layer. It is an optional parameter. If set,
-                  then this function will just return layer's name.
+    :param input: The input of this layer. It is an optional parameter.
     :param act: Activation Type. LinearActivation is the default activation.
     :type act: BaseActivation
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer config. Default is None.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :return: MixedLayerType object can add inputs or layer name.
+    :return: MixedLayerType object.
     :rtype: MixedLayerType
     """
 
@@ -938,14 +941,15 @@ def data_layer(name, size, depth=None, height=None, width=None,
 
     :param name: The name of this layer.
     :type name: basestring
-    :param size: Size of this data layer.
+    :param size: The dimension of this data layer.
     :type size: int
-    :param height: Height of this data layer, used for image
+    :param height: The height of the input image data.
     :type height: int | None
-    :param width: Width of this data layer, used for image
+    :param width: The width of the input image data.
     :type width: int | None
-    :param layer_attr: Extra Layer Attribute.
-    :type layer_attr: ExtraLayerAttribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -978,14 +982,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input of this layer, which must be Index Data.
+    :param input: The input of this layer, whose type must be Index Data.
     :type input: LayerOutput
-    :param size: The embedding dimension.
+    :param size: The dimension of the embedding vector.
     :type size: int
     :param param_attr: The embedding parameter attribute. See ParameterAttribute
                       for details.
-    :type param_attr: ParameterAttribute | None
-    :param layer_attr: Extra layer Config. Default is None.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1013,7 +1018,7 @@ def fc_layer(input,
              bias_attr=None,
              layer_attr=None):
     """
-    Helper for declare fully connected layer.
+    The fully connected layer.
 
     The example usage is:
 
@@ -1035,17 +1040,18 @@ def fc_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :param size: The layer dimension.
+    :param size: The dimension of this layer.
     :type size: int
     :param act: Activation Type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1086,13 +1092,15 @@ def fc_layer(input,
 @wrap_name_default("print")
 def printer_layer(input, format=None, name=None):
     """
-    Print the output value of input layers. This layer is useful for debugging.
+    Print the output value of the layers specified by the parameter input.
+    This layer is useful for debugging.
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     if isinstance(input, LayerOutput):
         input = [input]
@@ -1135,11 +1143,12 @@ def priorbox_layer(input,
     :param aspect_ratio: The aspect ratio.
     :type aspect_ratio: list
     :param variance: The bounding box variance.
-    :type min_size: The min size of the priorbox width/height.
+    :type min_size: The minimum size of the priorbox width/height.
     :param min_size: list
-    :type max_size: The max size of the priorbox width/height. Could be NULL.
+    :type max_size: The maximum size of the priorbox width/height. It could be NULL.
     :param max_size: list
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     # plus one for ratio 1.
     num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
@@ -1177,7 +1186,7 @@ def multibox_loss_layer(input_loc,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input_loc: The input predict locations.
+    :param input_loc: The input predicted locations.
     :type input_loc: LayerOutput | List of LayerOutput
     :param input_conf: The input priorbox confidence.
     :type input_conf: LayerOutput | List of LayerOutput
@@ -1189,13 +1198,15 @@ def multibox_loss_layer(input_loc,
     :type num_classes: int
     :param overlap_threshold: The threshold of the overlap.
     :type overlap_threshold: float
-    :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox.
+    :param neg_pos_ratio: The ratio of the negative bounding box to
+                          the positive bounding box.
     :type neg_pos_ratio: float
-    :param neg_overlap: The negative bbox overlap threshold.
+    :param neg_overlap: The negative bounding box overlap threshold.
     :type neg_overlap: float
     :param background_id: The background class index.
     :type background_id: int
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     if isinstance(input_loc, LayerOutput):
         input_loc = [input_loc]
@@ -1258,19 +1269,20 @@ def detection_output_layer(input_loc,
     :type input_conf: LayerOutput | List of LayerOutput.
     :param priorbox: The input priorbox location and the variance.
     :type priorbox: LayerOutput
-    :param num_classes: The number of the classification.
+    :param num_classes: The number of the classes.
     :type num_classes: int
     :param nms_threshold: The Non-maximum suppression threshold.
     :type nms_threshold: float
-    :param nms_top_k: The bbox number kept of the NMS's output
+    :param nms_top_k: The bounding boxes number kept of the NMS's output.
     :type nms_top_k: int
-    :param keep_top_k: The bbox number kept of the layer's output
+    :param keep_top_k: The bounding boxes number kept of the layer's output.
     :type keep_top_k: int
-    :param confidence_threshold: The classification confidence threshold
+    :param confidence_threshold: The classification confidence threshold.
     :type confidence_threshold: float
     :param background_id: The background class index.
     :type background_id: int
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     if isinstance(input_loc, LayerOutput):
         input_loc = [input_loc]
@@ -1326,7 +1338,7 @@ def roi_pool_layer(input,
     A layer used by Fast R-CNN to extract feature maps of ROIs from the last
     feature map.
 
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: The input layer.
     :type input: LayerOutput.
@@ -1338,9 +1350,10 @@ def roi_pool_layer(input,
     :type pooled_height: int
     :param spatial_scale: The spatial scale between the image and feature map.
     :type spatial_scale: float
-    :param num_channels: number of input channel.
+    :param num_channels: The number of the input channels.
     :type num_channels: int
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     if num_channels is None:
         assert input.num_filters is not None
@@ -1361,18 +1374,19 @@ def roi_pool_layer(input,
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
-    Normalize a layer's output. This layer is necessary for ssd.
-    This layer applys normalize across the channels of each sample to
-    a conv layer's output and scale the output by a group of trainable
-    factors which dimensions equal to the channel's number.
+    Normalize a layer's output. This layer is necessary for ssd. This
+    layer applys normalization across the channels of each sample to
+    a convolutional layer's output and scales the output by a group of
+    trainable factors whose dimensions equal to the channel's number.
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     assert input.num_filters is not None
     Layer(
@@ -1413,12 +1427,9 @@ def pooling_layer(input,
     Pooling layer for sequence inputs, not used for Image.
 
     If stride > 0, this layer slides a window whose size is determined by stride,
-    and return the pooling value of the window as the output. Thus, a long sequence
-    will be shorten.
-
-    The parameter stride specifies the intervals at which to apply the pooling
-    operation. Note that for sequence with sub-sequence, the default value
-    of stride is -1.
+    and returns the pooling value of the sequence in the window as the output. Thus,
+    a long sequence will be shortened. Note that for sequence with sub-sequence, the
+    default value of stride is -1.
 
     The example usage is:
 
@@ -1435,16 +1446,16 @@ def pooling_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
-                         SumPooling, SquareRootNPooling.
+    :param pooling_type: Type of pooling. MaxPooling is the default pooling.
     :type pooling_type: BasePoolingType | None
     :param stride: The step size between successive pooling regions.
-    :type stride: Int
+    :type stride: int
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The Extra Attributes for layer, such as dropout.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6618,7 +6629,7 @@ def row_conv_layer(input,
     .. math::
 
         r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
-                  \quad \text{for} \quad  (1 \leq i \leq d)
+                  \quad \\text{for} \quad  (1 \leq i \leq d)
 
     Note:
         The `context_len` is `k + 1`. That is to say, the lookahead step
@@ -6767,7 +6778,7 @@ def gated_unit_layer(input,
     The gated unit layer implements a simple gating mechanism over the input.
     The input :math:`X` is first projected into a new space :math:`X'`, and
     it is also used to produce a gate weight :math:`\sigma`. Element-wise
-    product between :match:`X'` and :math:`\sigma` is finally returned.
+    product between :math:`X'` and :math:`\sigma` is finally returned.
 
     Reference:
         `Language Modeling with Gated Convolutional Networks
@@ -7463,7 +7474,7 @@ def factorization_machine(input,
     Factorization Machine with the formula:
 
     .. math::
-        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j
 
     Note:
         X is the input vector with size n. V is the factor matrix. Each row of V
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 9776ae18057d57dd994fac8b62090258252922c6..b5cde7bac779ee1d54395b68941df2693e1ed0f5 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -25,10 +25,10 @@ from paddle.trainer.config_parser import *
 __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
     "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
-    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'dot_product_attention', 'multi_head_attention',
-    'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm',
-    'inputs', 'outputs'
+    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
+    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
+    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
+    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
 ]
 
 ######################################################
@@ -251,13 +251,13 @@ def img_conv_bn_pool(input,
                      pool_layer_attr=None):
     """
     Convolution, batch normalization, pooling group.
-    
+
     Img input => Conv => BN => Pooling => Output.
 
     :param name: group name.
     :type name: basestring
     :param input: input layer.
-    :type input: LayerOutput 
+    :type input: LayerOutput
     :param filter_size: see img_conv_layer for details.
     :type filter_size: int
     :param num_filters: see img_conv_layer for details.
@@ -435,6 +435,85 @@ def img_conv_group(input,
         input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)
 
 
+@wrap_name_default("separable_conv")
+def img_separable_conv(input,
+                       num_channels,
+                       num_out_channels,
+                       filter_size,
+                       stride=1,
+                       padding=0,
+                       depth_multiplier=1,
+                       act=None,
+                       bias_attr=None,
+                       param_attr=None,
+                       shared_bias=True,
+                       layer_type='exconv',
+                       name=None):
+    """
+    Separable Convolution.
+
+    The separable convolution module is consisted of a depthwise convolution
+    that acts separately on input channels, followed by a pointwise convolution
+    with 1*1 kernels that mixes channels. It is used for Xception:
+    https://arxiv.org/pdf/1610.02357.pdf
+
+    :param input: input layer.
+    :type input: LayerOutput
+    :param num_channels: the number of input channels.
+    :type num_channels: int
+    :param num_out_channels: the number of output channels.
+    :type num_out_channels: int
+    :param filter_size: the filter size for the depthwise convolution.
+    :type filter_size: int|tuple
+    :param stride: the stride size for the depthwise convolution.
+    :type stride: int|tuple
+    :param padding: the padding size for the depthwise convolution.
+    :type padding: int|tuple
+    :param depth_multiplier: the number of filter for one channel in the
+                             depthwize convolution.
+    :type depth_multiplier: int
+    :param act: the activation function for the output.
+    :type act: BaseActivation
+    :param bias_attr: see img_conv_layer for details.
+    :type bias_attr: ParameterAttribute
+    :param param_attr: see img_conv_layer for details.
+    :type param_attr: ParameterAttribute
+    :param shared_bias: see img_conv_layer for details.
+    :type shared_bias: bool
+    :param layer_type: see img_conv_layer for details.
+    :type layer_type: bool
+    :return: layer's output
+    :rtype: LayerOutput
+    """
+    __depthwise_conv__ = img_conv_layer(
+        name="%s_depthwise_conv" % name,
+        input=input,
+        num_channels=num_channels,
+        num_filters=num_channels * depth_multiplier,
+        groups=num_channels,
+        filter_size=filter_size,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias,
+        layer_type=layer_type)
+    __pointwise_conv__ = img_conv_layer(
+        name="%s_pointwise_conv" % name,
+        input=__depthwise_conv__,
+        num_channels=num_channels * depth_multiplier,
+        num_filters=num_out_channels,
+        filter_size=1,
+        stride=1,
+        padding=0,
+        act=act,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias)
+    return __pointwise_conv__
+
+
 def small_vgg(input_image, num_channels, num_classes):
     def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
         return img_conv_group(
@@ -648,7 +727,7 @@ def lstmemory_unit(input,
                    lstm_bias_attr=None,
                    lstm_layer_attr=None):
     """
-    lstmemory_unit defines the caculation process of a LSTM unit during a 
+    lstmemory_unit defines the caculation process of a LSTM unit during a
     single time step. This function is not a recurrent layer, so it can not be
     directly used to process sequence input. This function is always used in
     recurrent_group (see layers.py for more details) to implement attention
@@ -869,7 +948,7 @@ def gru_unit(input,
              gru_layer_attr=None,
              naive=False):
     """
-    gru_unit defines the calculation process of a gated recurrent unit during a single 
+    gru_unit defines the calculation process of a gated recurrent unit during a single
     time step. This function is not a recurrent layer, so it can not be
     directly used to process sequence input. This function is always used in
     the recurrent_group (see layers.py for more details) to implement attention
@@ -1012,7 +1091,7 @@ def simple_gru(input,
     simple_gru in network.py. The reason why there are so many interfaces is
     that we have two ways to implement recurrent neural network. One way is to
     use one complete layer to implement rnn (including simple rnn, gru and lstm)
-    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But 
+    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But
     the multiplication operation :math:`W x_t` is not computed in these layers.
     See details in their interfaces in layers.py.
     The other implementation is to use an recurrent group which can ensemble a
@@ -1116,11 +1195,12 @@ def simple_gru2(input,
     :type act: BaseActivation
     :param gate_act: gate activiation type of gru
     :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer, 
+    :param gru_bias_attr: bias parameter attribute of gru layer,
                           False means no bias, None means default bias.
     :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
+    :param gru_param_attr: param parameter attribute of gru layer,
+                          None means default param.
+    :type gru_param_attr: ParameterAttribute|None
     :return: the gru group.
     :rtype: LayerOutput
     """
@@ -1188,7 +1268,7 @@ def bidirectional_gru(input,
     :type size: int
     :param return_seq: If set False, the last time step of output are
                        concatenated and returned.
-                       If set True, the entire output sequences in forward 
+                       If set True, the entire output sequences in forward
                        and backward directions are concatenated and returned.
     :type return_seq: bool
     :return: LayerOutput object.
@@ -1277,7 +1357,7 @@ def bidirectional_lstm(input,
     :type size: int
     :param return_seq: If set False, the last time step of output are
                        concatenated and returned.
-                       If set True, the entire output sequences in forward 
+                       If set True, the entire output sequences in forward
                        and backward directions are concatenated and returned.
     :type return_seq: bool
     :return: LayerOutput object.
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index cfc1c886e1389c15e3f803c341b6f62dd7b4bf41..21ed7f7a5ce279f5bc65e5b008f14a1b0ff97343 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -23,10 +23,9 @@ Besides, this module also provides API for building dictionary.
 import paddle.v2.dataset.common
 import collections
 import tarfile
-import Queue
 import re
 import string
-import threading
+import random
 
 __all__ = ['build_dict', 'train', 'test', 'convert']
 
@@ -74,47 +73,21 @@ def build_dict(pattern, cutoff):
     return word_idx
 
 
-def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+def reader_creator(pos_pattern, neg_pattern, word_idx):
     UNK = word_idx['<unk>']
+    INS = []
 
-    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
-
-    def load(pattern, queue):
+    def load(pattern, out, label):
         for doc in tokenize(pattern):
-            queue.put(doc)
-        queue.put(None)
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+    random.shuffle(INS)
 
     def reader():
-        # Creates two threads that loads positive and negative samples
-        # into qs.
-        t0 = threading.Thread(
-            target=load, args=(
-                pos_pattern,
-                qs[0], ))
-        t0.daemon = True
-        t0.start()
-
-        t1 = threading.Thread(
-            target=load, args=(
-                neg_pattern,
-                qs[1], ))
-        t1.daemon = True
-        t1.start()
-
-        # Read alternatively from qs[0] and qs[1].
-        i = 0
-        doc = qs[i].get()
-        while doc != None:
-            yield [word_idx.get(w, UNK) for w in doc], i % 2
-            i += 1
-            doc = qs[i % 2].get()
-
-        # If any queue is empty, reads from the other queue.
-        i += 1
-        doc = qs[i % 2].get()
-        while doc != None:
-            yield [word_idx.get(w, UNK) for w in doc], i % 2
-            doc = qs[i % 2].get()
+        for doc, label in INS:
+            yield doc, label
 
     return reader
 
@@ -133,7 +106,7 @@ def train(word_idx):
     """
     return reader_creator(
         re.compile("aclImdb/train/pos/.*\.txt$"),
-        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
 
 
 def test(word_idx):
@@ -150,7 +123,7 @@ def test(word_idx):
     """
     return reader_creator(
         re.compile("aclImdb/test/pos/.*\.txt$"),
-        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
 def word_dict():
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 59986c9f0ca8e4b793463db0e8c5da0489654ee9..c72b5730695dbc4f772015f1fb8dec6814cd1837 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -15,13 +15,15 @@ import backward
 import regularizer
 from param_attr import ParamAttr
 from data_feeder import DataFeeder
-from core import LoDTensor, CPUPlace, GPUPlace
+from core import LoDTensor, CPUPlace, CUDAPlace
+from distribute_transpiler import DistributeTranspiler
+import clip
 
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
     'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
-    'DataFeeder'
+    'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr'
+    'DataFeeder', 'clip', 'DistributeTranspiler'
 ]
 
 
@@ -40,5 +42,10 @@ def __read_gflags_from_env__():
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
 
+    if core.is_compile_gpu():
+        core.init_devices(["CPU", "GPU:0"])
+    else:
+        core.init_devices(["CPU"])
+
 
 __read_gflags_from_env__()
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index f188582178f667125ec95cd230100fdb10ce7e88..6966cc75804b6b5a49ceb45a26994c23d2936bdb 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,17 +1,209 @@
 from paddle.v2.fluid import framework as framework
+from . import core
+import collections
 
-__all__ = ['append_backward_ops']
+__all__ = ['append_backward']
 
 
-def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
+def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None,
+                 end_idx=None):
+    if begin_idx is None:
+        begin_idx = 0
+    if end_idx is None:
+        end_idx = len(op_desc_list)
+    for i in range(begin_idx, end_idx):
+        op_desc = op_desc_list[i]
+        if isinstance(op_desc, tuple):
+            op_desc = op_desc[0]
+        op_desc.rename_input(old_name, new_name)
+        op_desc.rename_output(old_name, new_name)
+
+
+def _create_op_desc_(op_type, inputs, outputs, attrs):
+    op_desc = core.OpDesc()
+    op_desc.set_type(op_type)
+    for para, args in inputs.iteritems():
+        op_desc.set_input(para, args)
+    for para, args in outputs.iteritems():
+        op_desc.set_output(para, args)
+    for name, val in attrs.iteritems():
+        if isinstance(val, framework.Block):
+            op_desc.set_block_attr(name, val.desc)
+        else:
+            op_desc.set_attr(name, val)
+    return op_desc
+
+
+def _infer_var_data_type_(var_name, block):
+    grad_var = block.desc.find_var(var_name.encode("ascii"))
+    fwd_name = _strip_grad_suffix_(var_name.encode("ascii"))
+    if block.desc.has_var_recursive(fwd_name):
+        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+        grad_var.set_dtype(fwd_var.dtype())
+    else:
+        grad_var.set_dtype(core.DataType.FP32)
+
+
+def _all_in_set_(cands, s):
+    for c in cands:
+        if not c in s:
+            return False
+    return True
+
+
+def _strip_grad_suffix_(name):
+    pos = name.find(core.grad_var_suffix())
+    return name[:pos] if pos != -1 else name
+
+
+def _append_grad_suffix_(name):
+    return name + core.grad_var_suffix()
+
+
+def _addup_repetitive_outputs_(op_descs):
+    # In backward part, an variable my be the output of more than one ops.
+    # In this case, the variable should be the accumulation of all the outputs.
+    # We adopt adding `sum_op`s to implement the accumulate.
+    pending_sum_ops = []
+    var_rename_count = collections.defaultdict(int)
+    renamed_vars = collections.defaultdict(list)
+    for idx, op_desc in enumerate(op_descs):
+        for var_name in op_desc.input_arg_names():
+            if len(renamed_vars[var_name]) > 1:
+                pending_sum_ops.append(
+                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                                      {"Out": [var_name]}, {}), idx))
+                renamed_vars[var_name] = [var_name]
+        for var_name in op_desc.output_arg_names():
+            if var_name == core.empty_var_name(
+            ) or var_name in op_desc.input_arg_names():
+                # empty variable or inplace op
+                continue
+            if len(renamed_vars[var_name]) == 0:
+                # it's the first time we get the variable
+                renamed_vars[var_name] = [var_name]
+            else:
+                if len(renamed_vars[var_name]) == 1:
+                    new_name = var_name + "@RENAME@" + \
+                        str(var_rename_count[var_name])
+                    var_rename_count[var_name] += 1
+                    # rename original var_name
+                    renamed_vars[var_name][0] = new_name
+                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                    _rename_arg_(pending_sum_ops, var_name, new_name)
+
+                new_name = var_name + "@RENAME@" + \
+                    str(var_rename_count[var_name])
+                var_rename_count[var_name] += 1
+                op_desc.rename_output(var_name, new_name)
+                renamed_vars[var_name].append(new_name)
+    for var_name, inputs in renamed_vars.iteritems():
+        if len(inputs) > 1:
+            pending_sum_ops.append((_create_op_desc_(
+                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+    # sum_op descs are sorted according to their insert position
+    for p in reversed(pending_sum_ops):
+        op_descs.insert(p[1], p[0])
+
+    return op_descs
+
+
+def _remove_no_grad_branch_(op_descs, no_grad_set):
+    # Remove ops whose outputs are all in no_grad_dict
+    op_descs = filter(
+        lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set),
+        op_descs)
+    # Insert fill_zeros_like_op
+    to_insert = []
+    for idx, op_desc in enumerate(op_descs):
+        for arg in op_desc.input_arg_names():
+            if core.grad_var_suffix() in arg and arg in no_grad_set:
+                to_insert.append((_create_op_desc_("fill_zeros_like", {
+                    "X": [_strip_grad_suffix_(arg)]
+                }, {"Y": [arg]}, {}), idx))
+
+    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+
+    return op_descs
+
+
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var,
+                          callback=None):
+    grad_op_descs = []
+    program = block.program
+    for op in reversed(block.ops):
+        grad_sub_block_list = []
+        # If the op has its own sub-block, deal with the sub-block first
+        if op.has_attr("sub_block"):
+            sub_block = program.block(op.block_attr("sub_block"))
+            grad_sub_block = program.create_block(parent_idx=sub_block.idx)
+            _append_backward_ops_(target, sub_block, grad_sub_block,
+                                  no_grad_dict, grad_to_var, callback)
+            grad_sub_block_list.append(grad_sub_block.desc)
+
+        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+        grad_op_descs.extend(grad_op_desc)
+        grad_to_var.update(op_grad_to_var)
+
+    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
+
+    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
+                                            no_grad_dict[block.idx])
+
+    if target_block.idx == 0:
+        grad_op_descs.insert(
+            0,
+            _create_op_desc_("fill_constant", {}, {
+                "Out": [_append_grad_suffix_(target.name)]
+            }, {"shape": [1],
+                "value": 1.0,
+                "dtype": target.dtype}))
+    # append op_desc in grad_op_descs to target_block
+    for op_desc in grad_op_descs:
+        new_op_desc = target_block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+
+
+def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        if op_desc.has_attr("sub_block"):
+            sub_block = block.program.block(op_desc.block_attr("sub_block"))
+            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
+        new_vars = set()
+        # create new gradient variables
+        for grad_var_name in op_desc.output_arg_names():
+            grad_var_name = grad_var_name.encode("ascii")
+            if block.desc.has_var_recursive(
+                    grad_var_name) or grad_var_name == core.empty_var_name():
+                continue
+            block.desc.var(grad_var_name)
+            new_vars.add(grad_var_name)
+            if not grad_to_var.has_key(grad_var_name):
+                continue
+            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
+        # infer_shape and infer_type
+        op_desc.infer_var_type(block.desc)
+        op_desc.infer_shape(block.desc)
+        for arg in op_desc.output_arg_names():
+            if arg in new_vars:
+                _infer_var_data_type_(arg, block)
+
+
+def append_backward(loss, parameter_list=None, no_grad_set=None):
     """
     Create and add gradient Operators in BlockDesc to compute
     gradients of `loss` for parameters in parameter_list
 
     :param loss: an variable generated by cost function.
     :type loss: Variable
-    :param no_grad_set: variable that should not create gradient
-    :type no_grad_set: set
+    :param no_grad_dict: variable that should not create gradient
+    :type no_grad_dict: set
     :param parameter_list: parameters that need to compute gradient and 
     update to optimize the lost.
     :type: list
@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
     """
     assert isinstance(loss, framework.Variable)
 
+    program = loss.block.program
+    no_grad_dict = dict()
     if no_grad_set is None:
-        program = loss.block.program
         assert isinstance(program, framework.Program)
-        no_grad_set = list()
         for block in program.blocks:
             assert isinstance(block, framework.Block)
+            block_no_grad_set = set()
             for var in block.vars.itervalues():
                 assert isinstance(var, framework.Variable)
                 if var.stop_gradient:
-                    no_grad_set.append(var.name)
-        no_grad_set = set(no_grad_set)
+                    block_no_grad_set.add(_append_grad_suffix_(var.name))
+            no_grad_dict[block.idx] = block_no_grad_set
+    elif isinstance(no_grad_set, set):
+        no_grad_dict = {0: no_grad_set}
+    else:
+        raise ValueError("'no_grad_set' should be a set or None.")
+
+    grad_info_map = dict()
+    root_block = program.block(0)
+
+    fwd_op_num = root_block.desc.op_size()
+    current_block_idx = program.current_block_idx
+    grad_to_var = dict()
+
+    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
+                          grad_to_var)
+    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
+
+    program.current_block_idx = current_block_idx
+    program.sync_with_cpp()
 
-    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
     if parameter_list is not None:
         parameters = parameter_list
     else:
-        params = loss.block.program.global_block().all_parameters()
+        params = program.global_block().all_parameters()
         parameters = [param.name for param in params]
     params_and_grads = []
     for param in parameters:
-        if param not in param_grad_map:
+        if param not in grad_info_map:
             raise ValueError("param %s is not in map" % param)
-        grad_info = param_grad_map[param]
-        grad_block = loss.block.program.block(grad_info[1])
+        grad_info = grad_info_map[param]
+        grad_block = grad_info[1]
         if not grad_block.has_var(grad_info[0]):
             raise ValueError("grad block[{0}] did not have grad var {1}".format(
                 grad_info[1], grad_info[0]))
         # Get the param var from the global block
-        param_var = loss.block.program.global_block().var(param)
+        param_var = program.global_block().var(param)
         grad_var = grad_block.var(grad_info[0])
         if loss.block.has_var(grad_info[0]):
             params_and_grads.append((param_var, grad_var))
diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7ec2fbe13fe6d9158345099b8668afc5c7d4571
--- /dev/null
+++ b/python/paddle/v2/fluid/clip.py
@@ -0,0 +1,61 @@
+import functools
+import layers
+
+__all__ = ['GradientClipByValue', 'append_gradient_clip_ops']
+
+
+class BaseGradientClipAttr(object):
+    def process_context(self, context, p_g):
+        raise NotImplementedError()
+
+    def create_operators(self, param, grad):
+        raise NotImplementedError()
+
+
+class NullGradientClipAttr(BaseGradientClipAttr):
+    def process_context(self, context, p_g):
+        pass
+
+    def create_operators(self, param, grad):
+        return param, grad
+
+
+class GradientClipByValue(BaseGradientClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def process_context(self, context, p_g):
+        pass
+
+    def create_operators(self, param, grad):
+        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
+        return param, new_grad
+
+
+def append_gradient_clip_ops(param_grad):
+    context = dict()
+    create_op_callbacks = []
+    for p, g in param_grad:
+        clip_attr = getattr(p, 'clip_attr', NullGradientClipAttr())
+        if clip_attr is None:
+            clip_attr = NullGradientClipAttr()
+        if not isinstance(clip_attr, BaseGradientClipAttr):
+            raise TypeError(
+                "clip attribute should be an instance of BaseGradientClippingAttr"
+            )
+
+        clip_attr.process_context(context=context, p_g=param_grad)
+        create_op_callbacks.append(
+            functools.partial(
+                clip_attr.create_operators, param=p, grad=g))
+
+    return [each_callback() for each_callback in create_op_callbacks]
+
+
+ClipByValue = GradientClipByValue
diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py
index 30a542af212926c93381aade426e25f2117e4662..24036c3e75b9594ba58cccb02825ab8020d1e107 100644
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
@@ -3,7 +3,7 @@ import core
 import numpy
 import six.moves as six
 
-from framework import Variable
+from framework import Variable, default_main_program
 
 __all__ = ['DataFeeder']
 
@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):
 
 
 class DataFeeder(object):
-    def __init__(self, feed_list, place):
+    def __init__(self, feed_list, place, program=None):
         self.feed_dtypes = []
         self.feed_names = []
         self.feed_shapes = []
         self.feed_lod_level = []
+        if program is None:
+            program = default_main_program()
         for each_var in feed_list:
+            if isinstance(each_var, basestring):
+                each_var = program.block(0).var(each_var)
             if not isinstance(each_var, Variable):
                 raise TypeError("Feed list should contain a list of variable")
             self.feed_dtypes.append(each_var.dtype)
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..49ece7b725e318d7526d58fe54c97cbe20200a7d
--- /dev/null
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -0,0 +1,242 @@
+import framework
+from framework import Program, default_main_program, Parameter, Variable
+import optimizer
+from layer_helper import LayerHelper
+
+
+def hash_name_to_server(params_grads, pserver_endpoints):
+    """
+    :param param_grads:
+    :return: a map of pserver endpoint -> 
+                    params -> [param list]
+                    grads  -> [grad list]
+    """
+
+    def _hash_param(param_name, total):
+        return hash(param_name) % total
+
+    param_grad_map = dict()
+    for param, grad in params_grads:
+        if param.trainable is True and grad is not None:
+            server_id = _hash_param(param.name, len(pserver_endpoints))
+            server_for_param = pserver_endpoints[server_id]
+            if not param_grad_map.has_key(server_for_param):
+                param_grad_map[server_for_param] = {"params": [], "grads": []}
+            param_grad_map[server_for_param]["params"].append(param)
+            param_grad_map[server_for_param]["grads"].append(grad)
+
+    return param_grad_map
+
+
+def round_robin(params_grads, pserver_endpoints):
+    assert (len(params_grads) > len(pserver_endpoints))
+
+    param_grad_map = dict()
+    pserver_idx = 0
+    for param, grad in params_grads:
+        if param.trainable is True:
+            server_for_param = pserver_endpoints[pserver_idx]
+            if not param_grad_map.has_key(server_for_param):
+                param_grad_map[server_for_param] = {"params": [], "grads": []}
+
+            param_grad_map[server_for_param]["params"].append(param)
+            param_grad_map[server_for_param]["grads"].append(grad)
+
+            pserver_idx += 1
+            if pserver_idx >= len(pserver_endpoints):
+                pserver_idx = 0
+    return param_grad_map
+
+
+class DistributeTranspiler:
+    def transpile(self,
+                  optimize_ops,
+                  params_grads,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  split_method=round_robin):
+        """
+            Transpile the program to a distributed data-parallelism programs.
+
+            The main_program will be transform to use a remote parameter server
+            to do parameter optimization. And the optimization graph will be put
+            in to a parameter server program.
+
+            Use different methods to split trainable varialbles to different
+            parameter servers.
+
+            Example to run:
+
+            exe = fluid.Executor(place)
+            t = fluid.DistributeTranspiler()
+            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
+
+            pserver_endpoint = os.getenv("PSERVER")
+            if pserver_endpoint:
+                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
+                exe.run(fluid.default_startup_program())
+                exe.run(pserver_prog)
+            else:
+                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+                exe.run(fluid.default_startup_program())
+
+                for pass_id in range(PASS_NUM):
+                    ...
+
+            :param optimize_ops: op list of optimization, should be the
+                                 return value of Optimizer.minimize
+            :type optimize_ops: list
+            :param program: program to optimize, default default_main_program
+            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
+            :type pservers: string
+
+            :return: return a list of programs
+        """
+        if program is None:
+            program = default_main_program()
+        self.program = program
+        self.trainers = trainers
+        self.optimize_ops = optimize_ops
+        self._optimize_distributed(
+            optimize_ops,
+            program,
+            params_grads,
+            pservers=pservers,
+            trainers=trainers,
+            split_method=split_method)
+
+    def _clone_param(self, block, v):
+        assert isinstance(v, Parameter)
+        new_p = Parameter(
+            block=block,
+            shape=v.shape,
+            dtype=v.dtype,
+            type=v.type,
+            lod_level=v.lod_level,
+            stop_gradient=v.stop_gradient,
+            trainable=v.trainable,
+            optimize_attr=v.optimize_attr,
+            regularizer=v.regularizer,
+            name=v.name)
+        block.vars[new_p.name] = new_p
+
+    def _clone_var(self, block, var):
+        assert isinstance(var, Variable)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=var.persistable)
+
+    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
+                              **kwargs):
+        if kwargs.has_key("split_method"):
+            split_method = kwargs["split_method"]
+        else:
+            split_method = round_robin
+
+        assert (callable(split_method))
+        pserver_endpoints = kwargs["pservers"].split(",")
+        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
+
+        send_op_ordered_inputs = []
+        send_op_ordered_outputs = []
+        epmap = []
+        for ep, v in self.param_grad_map.iteritems():
+            send_op_ordered_inputs.extend(v["grads"])
+            send_op_ordered_outputs.extend(v["params"])
+            for i in v["grads"]:
+                epmap.append(ep)
+        send_op = program.global_block().append_op(
+            type="send",
+            inputs={"X": send_op_ordered_inputs
+                    },  # inputs is a list of tensors to be send
+            outputs={"Out": send_op_ordered_outputs},
+            attrs={"endpoints": pserver_endpoints,
+                   "epmap": epmap})
+
+    def get_trainer_program(self):
+        # remove optimize ops and add a send op to main_program
+        self.program.global_block().delete_ops(self.optimize_ops)
+        return self.program
+
+    def _create_var_for_trainers(self, block, var, trainers):
+        var_list = []
+        for i in xrange(trainers):
+            var_each = block.create_var(
+                name="%s.trainer_%d" % (var.name, i),
+                psersistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+            var_list.append(var_each)
+        return var_list
+
+    def get_pserver_program(self, endpoint, optimize_ops):
+        pserver_program = Program()
+        for v in self.param_grad_map[endpoint]["params"]:
+            self._clone_param(pserver_program.global_block(), v)
+
+        optimize_sub_program = Program()
+        grad_var_names = [
+            var.name for var in self.param_grad_map[endpoint]["grads"]
+        ]
+        for opt_op in optimize_ops:
+            for _, var in opt_op.inputs.iteritems():
+                # NOTE: append operators to merge gradients from multiple
+                # trainers. If trainers == 1, this is not needed.
+                if self.trainers > 1 and var.name in grad_var_names:
+                    vars2merge = self._create_var_for_trainers(
+                        optimize_sub_program.global_block(), var, self.trainers)
+                    merged_var = optimize_sub_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+                    optimize_sub_program.global_block().append_op(
+                        type="sum",
+                        inputs={"X": vars2merge},
+                        outputs={"Out": merged_var})
+                    optimize_sub_program.global_block().append_op(
+                        type="scale",
+                        inputs={"X": merged_var},
+                        outputs={"Out": merged_var},
+                        attrs={"scale": 1.0 / float(self.trainers)})
+                else:
+                    optimize_sub_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+            if opt_op.inputs.has_key("Grad"):
+                if opt_op.inputs["Grad"].name in grad_var_names:
+                    optimize_sub_program.global_block().append_op(
+                        type=opt_op.type,
+                        inputs=opt_op.inputs,
+                        outputs=opt_op.outputs,
+                        attrs=opt_op.attrs)
+            else:
+                optimize_sub_program.global_block().append_op(
+                    type=opt_op.type,
+                    inputs=opt_op.inputs,
+                    outputs=opt_op.outputs,
+                    attrs=opt_op.attrs)
+        pserver_program.global_block().append_op(
+            type="recv",
+            inputs={"RX":
+                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
+            outputs={},
+            attrs={
+                "OptimizeProgram": optimize_sub_program.desc,
+                "endpoint": endpoint,
+                "ParamList":
+                [p.name for p in self.param_grad_map[endpoint]["params"]],
+                "GradList":
+                [p.name for p in self.param_grad_map[endpoint]["grads"]],
+                "Trainers": self.trainers
+            })
+        pserver_program.sync_with_cpp()
+        return pserver_program
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
index 137c5736226b689340748d5098ca51659d5acff8..e186ee96c387acf24471d4e26ce020c4ecac8d19 100644
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -1,10 +1,10 @@
 import numpy as np
 
 import layers
-from framework import Program, unique_name, Variable
+from framework import Program, unique_name, Variable, program_guard
 from layer_helper import LayerHelper
 
-__all__ = ['Accuracy']
+__all__ = ['Accuracy', 'ChunkEvaluator']
 
 
 def _clone_var_(block, var):
@@ -49,15 +49,12 @@ class Evaluator(object):
         if reset_program is None:
             reset_program = Program()
 
-        for var in self.states:
-            assert isinstance(var, Variable)
-            g_var = _clone_var_(reset_program.current_block(), var)
-            layers.fill_constant(
-                shape=g_var.shape,
-                value=0.0,
-                dtype=g_var.dtype,
-                out=g_var,
-                main_program=reset_program)
+        with program_guard(main_program=reset_program):
+            for var in self.states:
+                assert isinstance(var, Variable)
+                g_var = _clone_var_(reset_program.current_block(), var)
+                layers.fill_constant(
+                    shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var)
 
         executor.run(reset_program)
 
@@ -104,20 +101,14 @@ class Accuracy(Evaluator):
         self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
         self.correct = self.create_state(
             dtype='int64', shape=[1], suffix='correct')
-        kwargs = {'main_program': main_program}
         total = self.helper.create_tmp_variable(dtype='int')
         correct = self.helper.create_tmp_variable(dtype='int')
         acc = layers.accuracy(
-            input=input,
-            label=label,
-            k=k,
-            total=total,
-            correct=correct,
-            **kwargs)
-        total = layers.cast(x=total, dtype='int64', **kwargs)
-        correct = layers.cast(x=correct, dtype='int64', **kwargs)
-        layers.sums(input=[self.total, total], out=self.total, **kwargs)
-        layers.sums(input=[self.correct, correct], out=self.correct, **kwargs)
+            input=input, label=label, k=k, total=total, correct=correct)
+        total = layers.cast(x=total, dtype='int64')
+        correct = layers.cast(x=correct, dtype='int64')
+        layers.sums(input=[self.total, total], out=self.total)
+        layers.sums(input=[self.correct, correct], out=self.correct)
 
         self.metrics.append(acc)
 
@@ -125,10 +116,75 @@ class Accuracy(Evaluator):
         if eval_program is None:
             eval_program = Program()
         block = eval_program.current_block()
-        kwargs = {'main_program': eval_program}
-        total = _clone_var_(block, self.total)
-        correct = _clone_var_(block, self.correct)
-        total = layers.cast(total, dtype='float32', **kwargs)
-        correct = layers.cast(correct, dtype='float32', **kwargs)
-        out = layers.elementwise_div(x=correct, y=total, **kwargs)
+        with program_guard(main_program=eval_program):
+            total = _clone_var_(block, self.total)
+            correct = _clone_var_(block, self.correct)
+            total = layers.cast(total, dtype='float32')
+            correct = layers.cast(correct, dtype='float32')
+            out = layers.elementwise_div(x=correct, y=total)
         return np.array(executor.run(eval_program, fetch_list=[out])[0])
+
+
+class ChunkEvaluator(Evaluator):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and 
+    compute the precision recall and F1-score using the accumulated counter 
+    numbers.
+    """
+
+    def __init__(
+            self,
+            input,
+            label,
+            chunk_scheme,
+            num_chunk_types,
+            excluded_chunk_types=None, ):
+        super(ChunkEvaluator, self).__init__("chunk_eval")
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.num_infer_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_infer_chunks')
+        self.num_label_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_label_chunks')
+        self.num_correct_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_correct_chunks')
+        precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
+            input=input,
+            label=label,
+            chunk_scheme=chunk_scheme,
+            num_chunk_types=num_chunk_types,
+            excluded_chunk_types=excluded_chunk_types, )
+        layers.sums(
+            input=[self.num_infer_chunks, num_infer_chunks],
+            out=self.num_infer_chunks)
+        layers.sums(
+            input=[self.num_label_chunks, num_label_chunks],
+            out=self.num_label_chunks)
+        layers.sums(
+            input=[self.num_correct_chunks, num_correct_chunks],
+            out=self.num_correct_chunks)
+
+        self.metrics.extend([precision, recall, f1_score])
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run(
+            eval_program,
+            fetch_list=[_clone_var_(block, state) for state in self.states])
+        num_infer_chunks = num_infer_chunks[0]
+        num_label_chunks = num_label_chunks[0]
+        num_correct_chunks = num_correct_chunks[0]
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return np.array(
+            [precision], dtype='float32'), np.array(
+                [recall], dtype='float32'), np.array(
+                    [f1_score], dtype='float32')
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index bdc82eede9d93a7cf904999a6b869ce2d23c90dc..2c91afb363bf72f2791e60c6df0d9130ccd698c5 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,6 +1,6 @@
 import numpy as np
 from . import core
-from framework import Program, default_main_program
+from framework import Program, default_main_program, Parameter, Variable
 
 __all__ = ['Executor', 'g_scope']
 
@@ -46,7 +46,15 @@ class Executor(object):
             p.set_place(each)
             act_places.append(p)
 
-        self.executor = core.Executor(act_places)
+        # TODO(dzhwinter) : consider that our fluid tests all written in 
+        # CUDAPlace(gpu_id), this will be changed in the future
+        if core.is_compile_gpu():
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
+
+        # TODO(dzhwinter) : only use the first place
+        self.executor = core.Executor(act_places[0])
         self.places = places
 
     def aslodtensor(self, data):
@@ -141,7 +149,7 @@ class Executor(object):
                 outputs={'Out': [fetch_var]},
                 attrs={'col': i})
 
-        self.executor.run(program.desc, scope, 0, True)
+        self.executor.run(program.desc, scope, 0, True, True)
         outs = [
             core.get_fetch_variable(scope, fetch_var_name, i)
             for i in xrange(len(fetch_list))
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index bf0cd275b62ae2c4d7312592b8a730291c59a071..b66a8bce5f4f15539007876c113afd3f878b00bc 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -1,10 +1,10 @@
 import collections
+import contextlib
 
 import numpy as np
-from . import core
+
 import proto.framework_pb2 as framework_pb2
-import google.protobuf.message
-import contextlib
+from . import core
 
 __all__ = [
     'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
@@ -12,6 +12,22 @@ __all__ = [
     'switch_main_program'
 ]
 
+EMPTY_VAR_NAME = core.kEmptyVarName()
+TEMP_VAR_NAME = core.kTempVarName()
+GRAD_VAR_SUFFIX = core.kGradVarSuffix()
+ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
+
+USE_CPU = core.kUseCPU()
+USE_CUDNN = core.kUseMKLDNN()
+USE_MKLDNN = core.kUseMKLDNN()
+
+
+def grad_var_name(var_name):
+    """
+    return gradient name for a certain var name
+    """
+    return var_name + GRAD_VAR_SUFFIX
+
 
 def unique_name(prefix):
     """
@@ -347,6 +363,10 @@ class Operator(object):
         """
         self.block = block
         self.desc = desc
+        # for clone a new operator
+        self.inputs = inputs
+        self.outputs = outputs
+        self.attrs = attrs
         if len(self.desc.type()) != 0:
             return
         if type is None:
@@ -377,7 +397,10 @@ class Operator(object):
                             % (in_proto.name, len(in_args)))
                     in_arg_names = []
                     for arg in in_args:
-                        in_arg_names.append(arg.name)
+                        if isinstance(arg, basestring):
+                            in_arg_names.append(arg)
+                        else:
+                            in_arg_names.append(arg.name)
                     self.desc.set_input(in_proto.name, in_arg_names)
                 else:
                     self.desc.set_input(in_proto.name, [])
@@ -418,13 +441,18 @@ class Operator(object):
                     continue
                 if isinstance(attrs[attr_name], Block):
                     self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
+                elif isinstance(attrs[attr_name], core.BlockDesc) or \
+                   isinstance(attrs[attr_name], core.ProgramDesc):
+                    self.desc.set_serialized_attr(
+                        attr_name, attrs[attr_name].serialize_to_string())
                 else:
                     self.desc.set_attr(attr_name, attrs[attr_name])
 
         self.desc.check_attrs()
         no_kernel_op_set = {
             'feed', 'fetch', 'save', 'load', 'recurrent',
-            'rnn_memory_helper_grad', 'conditional_block', 'while'
+            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
+            'recv'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
@@ -570,6 +598,7 @@ class Block(object):
         self.vars = dict()  # var_name --> var
         self.ops = collections.deque()  # operator list
         self.program = program
+        self.removed_vars = dict()
 
     def __str__(self):
         return self.to_string(True)
@@ -626,6 +655,16 @@ class Block(object):
         self.ops.append(op)
         return op
 
+    def delete_ops(self, ops):
+        # remove from cpp
+        # FIXME(typhoonzero): remove only the first occuracy.
+        try:
+            start = list(self.ops).index(ops[0])
+            end = list(self.ops).index(ops[-1])
+        except Exception, e:
+            raise e
+        self.desc.remove_op(start, end + 1)
+
     def prepend_op(self, *args, **kwargs):
         op_desc = self.desc.prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
@@ -704,6 +743,7 @@ class Block(object):
                 trainable=p.trainable,
                 optimize_attr=p.optimize_attr,
                 regularizer=p.regularizer,
+                clip_attr=p.clip_attr,
                 name=v.name)
             self.vars[new_p.name] = new_p
 
@@ -806,9 +846,11 @@ class Program(object):
         self.sync_with_cpp()
         return param_to_grad_info
 
-    def create_block(self):
+    def create_block(self, parent_idx=None):
         new_block_idx = len(self.blocks)
-        self.desc.append_block(self.current_block().desc)
+        parent = self.current_block() if parent_idx is None else self.block(
+            parent_idx)
+        self.desc.append_block(parent.desc)
         self.current_block_idx = new_block_idx
         self.blocks.append(Block(self, self.current_block_idx))
         return self.current_block()
@@ -866,6 +908,8 @@ class Parameter(Variable):
 
         self.regularizer = kwargs.get('regularizer', None)
 
+        self.clip_attr = kwargs.get('clip_attr', None)
+
 
 # program is a global instance.
 _main_program_ = Program()
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index e147ac22ad289eb00c83def66974d875fcdc31f8..c47ce82aba7fa5ac42ac26cd25fa3ebc93e96cb2 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -180,10 +180,22 @@ def save_inference_model(dirname,
 
     :return: None
     """
+    if isinstance(feeded_var_names, basestring):
+        feeded_var_names = [feeded_var_names]
+    else:
+        if not (bool(feeded_var_names) and all(
+                isinstance(name, basestring) for name in feeded_var_names)):
+            raise ValueError("'feed_var_names' should be a list of str.")
+
+    if isinstance(target_vars, Variable):
+        target_vars = [target_vars]
+    else:
+        if not (bool(target_vars) and all(
+                isinstance(var, Variable) for var in target_vars)):
+            raise ValueError("'target_vars' should be a list of Variable.")
+
     if main_program is None:
         main_program = default_main_program()
-    if not isinstance(target_vars, list):
-        target_vars = [target_vars]
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 3963e1322230259230885c097d37b818edda6b13..4469f7285efe1c31d0955c6dd4ba3ecac08070af 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -21,19 +21,11 @@ class LayerHelper(object):
 
     @property
     def main_program(self):
-        prog = self.kwargs.get('main_program', None)
-        if prog is None:
-            return default_main_program()
-        else:
-            return prog
+        return default_main_program()
 
     @property
     def startup_program(self):
-        prog = self.kwargs.get('startup_program', None)
-        if prog is None:
-            return default_startup_program()
-        else:
-            return prog
+        return default_startup_program()
 
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)
@@ -151,13 +143,6 @@ class LayerHelper(object):
             persistable=True,
             initializer=initializer)
 
-    @property
-    def to_kwargs(self):
-        return {
-            'main_program': self.main_program,
-            'startup_program': self.startup_program
-        }
-
     def append_bias_op(self, input_var, dim_start=1, dim_end=None):
         """
         Append bias operator and return its output. If the user does not set
@@ -199,7 +184,7 @@ class LayerHelper(object):
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
-            outputs={"Y": [tmp]},
+            outputs={"Out": [tmp]},
             attrs=act)
         return tmp
 
@@ -209,3 +194,9 @@ class LayerHelper(object):
         else:
             # For integer and boolean types, initialize with all zeros
             return Constant()
+
+    def is_instance(self, param_name, cls):
+        param = self.kwargs.get(param_name, None)
+        if not isinstance(param, cls):
+            raise TypeError("The input {0} parameter of method {1} must be {2}",
+                            param_name, self.layer_type, cls.__name__)
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 5af6c789773fe80ceed99c69a419f18cf2db8d37..22a37c22c3fc777cadcdee6632bbf1fb558fef70 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -3,6 +3,7 @@ from ..framework import Program, Variable, Operator
 from .. import core
 from tensor import assign, fill_constant
 import contextlib
+from ..registry import autodoc
 
 __all__ = [
     'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard',
@@ -10,15 +11,11 @@ __all__ = [
     'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor',
     'increment', 'array_write', 'create_array', 'less_than', 'array_read',
     'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock',
-    'StaticRNN'
+    'StaticRNN', 'reorder_lod_tensor_by_rank'
 ]
 
 
-def split_lod_tensor(input,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
+def split_lod_tensor(input, mask, level=0):
     helper = LayerHelper('split_lod_tensor', **locals())
     out_true = helper.create_tmp_variable(dtype=input.dtype)
     out_false = helper.create_tmp_variable(dtype=input.dtype)
@@ -34,13 +31,7 @@ def split_lod_tensor(input,
     return out_true, out_false
 
 
-def merge_lod_tensor(in_true,
-                     in_false,
-                     x,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
+def merge_lod_tensor(in_true, in_false, x, mask, level=0):
     helper = LayerHelper('merge_lod_tensor', **locals())
     out = helper.create_tmp_variable(dtype=in_true.dtype)
     helper.append_op(
@@ -135,9 +126,8 @@ class StaticRNN(object):
     IN_RNN_BLOCK = 1
     AFTER_RNN_BLOCK = 2
 
-    def __init__(self, name=None, main_program=None):
-        self.helper = LayerHelper(
-            "static_rnn", name=name, main_program=main_program)
+    def __init__(self, name=None):
+        self.helper = LayerHelper("static_rnn", name=name)
         self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
         self.inputs = []  # input variable list in current block
         self.outputs = []  # output variable list in parent block
@@ -354,8 +344,8 @@ class While(object):
     IN_WHILE_BLOCK = 1
     AFTER_WHILE_BLOCK = 2
 
-    def __init__(self, cond, name=None, main_program=None):
-        self.helper = LayerHelper("while", name=name, main_program=main_program)
+    def __init__(self, cond, name=None):
+        self.helper = LayerHelper("while", name=name)
         self.status = While.BEFORE_WHILE_BLOCK
         if not isinstance(cond, Variable):
             raise TypeError("condition should be a variable")
@@ -406,7 +396,7 @@ class While(object):
             attrs={'sub_block': while_block})
 
 
-def lod_rank_table(x, level=0, main_program=None):
+def lod_rank_table(x, level=0):
     """
     This function creates an operator for creating a LOD_RANK_TABLE
     using the input x.
@@ -423,7 +413,7 @@ def lod_rank_table(x, level=0, main_program=None):
     return table
 
 
-def max_sequence_len(rank_table, main_program=None):
+def max_sequence_len(rank_table):
     """
     This function creates an operator to calculate the length of
     max seqence through input rank_table(should be a lod_rank_table)
@@ -437,7 +427,7 @@ def max_sequence_len(rank_table, main_program=None):
     return res
 
 
-def topk(input, k, main_program=None, startup_program=None):
+def topk(input, k):
     helper = LayerHelper('topk', **locals())
     topk_out = helper.create_tmp_variable(dtype=input.data_type)
     topk_indices = helper.create_tmp_variable(dtype='int64')
@@ -450,10 +440,26 @@ def topk(input, k, main_program=None, startup_program=None):
     return topk_out, topk_indices
 
 
-def lod_tensor_to_array(x, table, main_program=None):
-    """
-    This function creates an operator to convert an LOD_Tensor to
-    an array.
+def lod_tensor_to_array(x, table):
+    """This function performs the operation that converts an LOD_Tensor to
+       an array.
+
+    Args:
+        x (Variable|list): The tensor that needs to be converted to an array.
+        table (ParamAttr|list): The variable that stores the level of lod
+                                which is ordered by sequence length in
+                                descending order.
+
+    Returns:
+        Variable: The variable of type array that has been converted from a
+                  tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          table = fluid.layers.lod_rank_table(x, level=0)
+          array = fluid.layers.lod_tensor_to_array(x, table)
     """
     helper = LayerHelper("lod_tensor_to_array", **locals())
     array = helper.create_variable(
@@ -468,10 +474,27 @@ def lod_tensor_to_array(x, table, main_program=None):
     return array
 
 
-def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
-    """
-    This function creates an operator to convert an array to a
-    LOD_Tensor.
+def array_to_lod_tensor(x, table):
+    """This function performs the operations that converts an array to
+       an LOD_Tensor.
+
+    Args:
+        x (Variable|list): The array that needs to be converted to a tensor.
+        table (ParamAttr|list): The variable that stores the level of lod
+                                which is ordered by sequence length in
+                                descending order.
+
+    Returns:
+        Variable: The variable of type tensor that has been converted
+                  from an array.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          table = fluid.layers.lod_rank_table(x, level=0)
+          array = fluid.layers.lod_tensor_to_array(x, table)
+          lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
     """
     helper = LayerHelper("array_to_lod_tensor", **locals())
     tmp = helper.create_tmp_variable(dtype=x.dtype)
@@ -483,15 +506,25 @@ def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
     return tmp
 
 
-def increment(x,
-              value=1.0,
-              in_place=True,
-              main_program=None,
-              startup_program=None):
-    """
-    This function creates an operator to increment each value in the input
-    `x` by an amount: `value` as mentioned in the input parameter. This
-    operation is performed in-place by default.
+def increment(x, value=1.0, in_place=True):
+    """This function performs an operation that increments each value in the
+    input :math:`x` by an amount: :math:`value` as mentioned in the input
+    parameter. This operation is performed in-place by default.
+
+    Args:
+        x (Variable|list): The tensor that has the input values.
+        value (float): The amount by which the values should be incremented.
+        in_place (bool): If the increment should be performed in-place.
+
+    Returns:
+        Variable: The tensor variable storing the transformation of
+                  element-wise increment of each value in the input.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
+          data = fluid.layers.increment(x=data, value=3.0, in_place=True)
     """
     helper = LayerHelper("increment", **locals())
     if not in_place:
@@ -506,10 +539,25 @@ def increment(x,
     return out
 
 
-def array_write(x, i, array=None, main_program=None, startup_program=None):
-    """
-    This function creates an operator to write the data out as a
+def array_write(x, i, array=None):
+    """This function performs the operation to write the data out as an
     LOD_TENSOR_ARRAY.
+
+    Args:
+        x (Variable|list): The input tensor from which the data will be read.
+        i (Variable|list): The subscript index in tensor array, that points the
+                           place from which data will be read.
+        array (Variable|list): The data can be read into this variable if
+                               this is assigned.
+    Returns:
+        Variable: The tensor type variable that has the data written to it.
+
+    Examples:
+        .. code-block::python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = layers.array_write(tmp, i=i)
     """
     helper = LayerHelper('array_write', **locals())
     if array is None:
@@ -525,7 +573,22 @@ def array_write(x, i, array=None, main_program=None, startup_program=None):
     return array
 
 
-def create_array(dtype, main_program=None):
+def create_array(dtype):
+    """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
+    LayerHelper.
+
+    Args:
+        dtype (int|float): The data type of the elements in the array.
+
+    Returns:
+        Variable: The tensor variable storing the elements of data type.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.create_array(dtype='float32')
+
+    """
     helper = LayerHelper("array", **locals())
     return helper.create_variable(
         name="{0}.out".format(helper.name),
@@ -533,7 +596,25 @@ def create_array(dtype, main_program=None):
         dtype=dtype)
 
 
-def less_than(x, y, cond=None, main_program=None, **ignored):
+def less_than(x, y, cond=None, **ignored):
+    """
+    **Less than**
+
+    This layer returns the truth value of :math:`x < y` elementwise.
+
+    Args:
+        x(Variable): First operand of *less_than*
+        y(Variable): Second operand of *less_than*
+        cond(Variable|None): Optional output variable to store the result of *less_than*
+
+    Returns:
+        Variable: The tensor variable storing the output of *less_than*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.less_than(x=label, y=limit)
+    """
     helper = LayerHelper("less_than", **locals())
     if cond is None:
         cond = helper.create_tmp_variable(dtype='bool')
@@ -545,10 +626,20 @@ def less_than(x, y, cond=None, main_program=None, **ignored):
     return cond
 
 
-def array_read(array, i, main_program=None, startup_program=None):
-    """
-    This function creates an operator to read the data in as a
+def array_read(array, i):
+    """This function performs the operation to read the data in as an
     LOD_TENSOR_ARRAY.
+    Args:
+        array (Variable|list): The input tensor that will be written to an array.
+        i (Variable|list): The subscript index in tensor array, that points the
+                           place where data will be written to.
+    Returns:
+        Variable: The tensor type variable that has the data written to it.
+    Examples:
+        .. code-block::python
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = layers.array_read(tmp, i=i)
     """
     helper = LayerHelper('array_read', **locals())
     if not isinstance(
@@ -564,7 +655,7 @@ def array_read(array, i, main_program=None, startup_program=None):
     return out
 
 
-def shrink_memory(x, i, table, main_program=None, startup_program=None):
+def shrink_memory(x, i, table):
     """
     This function creates an operator to shrink_rnn_memory using the RankTable
     as mentioned in the input parameter.
@@ -581,10 +672,24 @@ def shrink_memory(x, i, table, main_program=None, startup_program=None):
     return out
 
 
-def array_length(array, main_program=None):
-    """
-    This function creates an operator to find the length of the
+def array_length(array):
+    """This function performs the operation to find the length of the input
     LOD_TENSOR_ARRAY.
+
+    Args:
+        array (LOD_TENSOR_ARRAY): The input array that will be used
+                                  to compute the length.
+
+    Returns:
+        Variable: The length of the input LoDTensorArray.
+
+    Examples:
+        .. code-block::python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = fluid.layers.array_write(tmp, i=i)
+          arr_len = fluid.layers.array_length(arr)
     """
     helper = LayerHelper('array_length', **locals())
     tmp = helper.create_tmp_variable(dtype='int64')
@@ -611,20 +716,12 @@ class ConditionalBlockGuard(BlockGuard):
 
 
 class ConditionalBlock(object):
-    def __init__(self,
-                 inputs,
-                 name=None,
-                 main_program=None,
-                 startup_program=None):
+    def __init__(self, inputs, name=None):
         for each_input in inputs:
             if not isinstance(each_input, Variable):
                 raise TypeError("Each input should be variable")
         self.inputs = inputs
-        self.helper = LayerHelper(
-            'conditional_block',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+        self.helper = LayerHelper('conditional_block', name=name)
 
     def block(self):
         return ConditionalBlockGuard(self)
@@ -709,15 +806,10 @@ class IfElse(object):
     IN_IF_ELSE_TRUE_BLOCKS = 1
     IN_IF_ELSE_FALSE_BLOCKS = 2
 
-    def __init__(self, cond, name=None, main_program=None,
-                 startup_program=None):
+    def __init__(self, cond, name=None):
         if not isinstance(cond, Variable):
             raise TypeError("cond must be a Variable")
-        self.helper = LayerHelper(
-            'ifelse',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+        self.helper = LayerHelper('ifelse', name=name)
         self.cond = cond
         self.input_table = {}
         self.status = IfElse.OUT_IF_ELSE_BLOCKS
@@ -782,11 +874,7 @@ class IfElse(object):
             out_table.append(outside_out)
 
             # assign local var to outside
-            assign(
-                input=each_out,
-                output=outside_out,
-                main_program=self.helper.main_program,
-                startup_program=self.helper.startup_program)
+            assign(input=each_out, output=outside_out)
 
     def __call__(self):
         if self.status != self.OUT_IF_ELSE_BLOCKS:
@@ -810,9 +898,7 @@ class IfElse(object):
                     in_false=false_var,
                     mask=self.cond,
                     x=self.cond,
-                    level=0,
-                    main_program=self.helper.main_program,
-                    startup_program=self.helper.startup_program))
+                    level=0))
         return rlist
 
 
@@ -821,12 +907,8 @@ class DynamicRNN(object):
     IN_RNN = 1
     AFTER_RNN = 2
 
-    def __init__(self, name=None, main_program=None, startup_program=None):
-        self.helper = LayerHelper(
-            'dynamic_rnn',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+    def __init__(self, name=None):
+        self.helper = LayerHelper('dynamic_rnn', name=name)
         self.status = DynamicRNN.BEFORE_RNN
         self.lod_rank_table = None
         self.max_seq_len = None
@@ -880,8 +962,7 @@ class DynamicRNN(object):
             inputs={'X': x,
                     'RankTable': self.lod_rank_table},
             outputs={'Out': input_array})
-        return array_read(
-            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
+        return array_read(array=input_array, i=self.step_idx)
 
     @contextlib.contextmanager
     def block(self):
@@ -892,32 +973,18 @@ class DynamicRNN(object):
         self.status = DynamicRNN.IN_RNN
         with self.while_op.block():
             yield
-            increment(
-                x=self.step_idx,
-                value=1.0,
-                in_place=True,
-                **self.helper.to_kwargs)
+            increment(x=self.step_idx, value=1.0, in_place=True)
 
             for new_mem, mem_array in self.mem_link:
-                array_write(
-                    x=new_mem,
-                    i=self.step_idx,
-                    array=mem_array,
-                    **self.helper.to_kwargs)
-
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                cond=self.cond,
-                **self.helper.to_kwargs)
+                array_write(x=new_mem, i=self.step_idx, array=mem_array)
+
+            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)
 
         self.status = DynamicRNN.AFTER_RNN
         for each_array in self.output_array:
             self.outputs.append(
                 array_to_lod_tensor(
-                    x=each_array,
-                    table=self.lod_rank_table,
-                    **self.helper.to_kwargs))
+                    x=each_array, table=self.lod_rank_table))
 
     def __call__(self, *args, **kwargs):
         if self.status != DynamicRNN.AFTER_RNN:
@@ -944,13 +1011,9 @@ class DynamicRNN(object):
                 inputs={'X': init,
                         'I': self.zero_idx},
                 outputs={'Out': mem_array})
-            retv = array_read(
-                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
+            retv = array_read(array=mem_array, i=self.step_idx)
             retv = shrink_memory(
-                x=retv,
-                i=self.step_idx,
-                table=self.lod_rank_table,
-                **self.helper.to_kwargs)
+                x=retv, i=self.step_idx, table=self.lod_rank_table)
             self.mem_dict[retv.name] = mem_array
             return retv
         else:
@@ -1020,3 +1083,18 @@ class DynamicRNN(object):
         if self.status != DynamicRNN.IN_RNN:
             raise ValueError("{0} can only be invoked inside rnn block.".format(
                 method))
+
+
+@autodoc
+def reorder_lod_tensor_by_rank(x, rank_table):
+    helper = LayerHelper('reorder_lod_tensor_by_rank', **locals())
+    helper.is_instance('x', Variable)
+    helper.is_instance('rank_table', Variable)
+
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reorder_lod_tensor_by_rank',
+        inputs={'X': [x],
+                'RankTable': [rank_table]},
+        outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
index f03d8e3c3e8797619adf837b28ed66ece7db295e..56c3f7b7b7f174338bb56bc5785423ca634650a6 100644
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -10,24 +10,11 @@ def data(name,
          dtype='float32',
          lod_level=0,
          type=core.VarDesc.VarType.LOD_TENSOR,
-         main_program=None,
-         startup_program=None,
          stop_gradient=True):
     """
-    Data Layer.
+    **Data Layer**
 
-    Args:
-       name: The name/alias of the function
-       shape: Tuple declaring the shape.
-       append_batch_size: Whether or not to append the data as a batch.
-       dtype: The type of data : float32, float_16, int etc
-       type: The output type. By default it is LOD_TENSOR.
-       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-       stop_gradient: A boolean that mentions whether gradient should flow.
-
-    This function takes in input and based on whether data has
+    This function takes in the input and based on whether data has
     to be returned back as a minibatch, it creates the global variable using
     the helper functions. The global variables can be accessed by all the
     following operations and layers in the graph.
@@ -35,6 +22,24 @@ def data(name,
     All the input variables of this function are passed in as local variables
     to the LayerHelper constructor.
 
+    Args:
+       name(str): The name/alias of the function
+       shape(list): Tuple declaring the shape.
+       append_batch_size(bool): Whether or not to append the data as a batch.
+       dtype(int|float): The type of data : float32, float_16, int etc
+       type(VarType): The output type. By default it is LOD_TENSOR.
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
+       main_program(Program): Name of the main program that calls this
+       startup_program(Program): Name of the startup program
+       stop_gradient(bool): A boolean that mentions whether gradient should flow.
+
+    Returns:
+        Variable: The global variable that gives access to the data.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='x', shape=[784], dtype='float32')
     """
     helper = LayerHelper('data', **locals())
     shape = list(shape)
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index f231f38b3e9f5666079623b99559b044b3c9bcc2..55b35ad543b2350915574d7f9b4ef460156ee459 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,12 +5,16 @@ All layers just related to the neural network.
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from ..param_attr import ParamAttr
+from tensor import concat
 
 __all__ = [
     'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
     'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
     'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
-    'batch_norm', 'beam_search_decode', 'conv2d_transpose'
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
+    'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min',
+    'sequence_first_step', 'sequence_last_step'
 ]
 
 
@@ -20,38 +24,85 @@ def fc(input,
        param_attr=None,
        bias_attr=None,
        act=None,
-       name=None,
-       main_program=None,
-       startup_program=None):
+       name=None):
     """
-    Fully Connected Layer.
+    **Fully Connected Layer**
+
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable (one for each input tensor) called weights for each input
+    tensor, which represents a fully connected weight matrix from each input
+    unit to each output unit. The fully connected layer multiplies each input
+    tensor with its coresponding weight to produce an output Tensor. If
+    multiple input tensors are given, the results of multiple multiplications
+    will be sumed up. If bias_attr is not None, a biases variable will be
+    created and added to the output. Finally, if activation is not None,
+    it will be applied to the output as well.
+
+    This process can be formulated as follows:
+
+    .. math::
+
+        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+
+    In the above equation:
+
+    * :math:`N`: Number of the input.
+    * :math:`X_i`: The input tensor.
+    * :math:`W`: The weights created by this layer.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation funtion.
+    * :math:`Out`: The output tensor.
 
     Args:
-       input: The input tensor to the function
-       size: The size of the layer
-       num_flatten_dims: Number of columns in input
-       param_attr: The parameters/weights to the FC Layer
-       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
-       bias_attr: The bias parameter for the FC layer
-       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
-       act: Activation to be applied to the output of FC layer
-       name: Name/alias of the function
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
+       input(Variable|list): The input tensor(s) to the fully connected layer.
+       size(int): The number of output units in the fully connected layer.
+       num_flatten_dims(int): The fc layer can accept an input tensor with more
+                              than two dimensions. If this happens, the
+                              multidimensional tensor will first be flattened
+                              into a 2-dimensional matrix. The parameter
+                              `num_flatten_dims` determines how the input tensor
+                              is flattened: the first `num_flatten_dims`
+                              dimensions will be flatten to form the first
+                              dimension of the final matrix (height of the
+                              matrix), and the rest `rank(X) - num_col_dims`
+                              dimensions are flattened to form the second
+                              dimension of the final matrix (width of the matrix).
+                              For example, suppose `X` is a 6-dimensional tensor
+                              with a shape [2, 3, 4, 5, 6], and
+                              `x_num_col_dims` = 3. Then, the flattened matrix
+                              will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+                              By default, `x_num_col_dims` is set to 1.
+       param_attr(ParamAttr|list): The parameter attribute for learnable
+                                   parameters/weights of the fully connected
+                                   layer.
+       param_initializer(ParamAttr|list): The initializer used for the
+                                          weight/parameter. If set None,
+                                          XavierInitializer() will be used.
+       bias_attr(ParamAttr|list): The parameter attribute for the bias parameter
+                                  for this layer. If set None, no bias will be
+                                  added to the output units.
+       bias_initializer(ParamAttr|list): The initializer used for the bias.
+                                        If set None, then ConstantInitializer()
+                                        will be used.
+       act(str): Activation to be applied to the output of the fully connected
+                 layer.
+       name(str): Name/alias of the fully connected layer.
 
-    This function can take in multiple inputs and performs the Fully Connected
-    function (linear transformation) on top of each of them.
-    So for input x, the output will be : Wx + b. Where W is the parameter,
-    b the bias and x is the input.
 
-    The function also applies an activation (non-linearity) on top of the
-    output, if activation is passed in the input.
+    Returns:
+        Variable: The output tensor variable.
 
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
 
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
     """
-    helper = LayerHelper('fc', **locals())
+
+    helper = LayerHelper("fc", **locals())
 
     dtype = helper.input_dtype()
 
@@ -71,8 +122,8 @@ def fc(input,
                 "Y": w,
             },
             outputs={"Out": tmp},
-            attrs={'x_num_col_dims': num_flatten_dims,
-                   'y_num_col_dims': 1})
+            attrs={"x_num_col_dims": num_flatten_dims,
+                   "y_num_col_dims": 1})
         mul_results.append(tmp)
 
     # sum
@@ -88,33 +139,33 @@ def fc(input,
     return helper.append_activation(pre_activation)
 
 
-def embedding(input,
-              size,
-              is_sparse=False,
-              param_attr=None,
-              dtype='float32',
-              main_program=None,
-              startup_program=None):
+def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
     """
-    Embedding Layer.
+    **Embedding Layer**
+
+    This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table.
+    The result of this lookup is the embedding of each ID in the *input*.
+
+    All the input variables are passed in as local variables to the LayerHelper
+    constructor.
 
     Args:
-       param_initializer:
-       input: The input to the function
-       size: The size of the layer
-       is_sparse: A flag that decleares whether the input is sparse
-       param_attr: Parameters for this layer
-       dtype: The type of data : float32, float_16, int etc
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
+       input(Variable): Input to the function
+       size(tuple|list|None): Shape of the look up table parameter 
+       is_sparse(bool): Boolean flag that specifying whether the input is sparse
+       param_attr(ParamAttr): Parameters for this layer
+       dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
 
-    This function can take in the input (which is a vector of IDs) and
-    performs a lookup in the lookup_table using these IDs, to result into
-    the embedding of each ID in the input.
+    Returns:
+        Variable: The tensor variable storing the embeddings of the \
+                  supplied inputs.
 
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+    Examples:
+        .. code-block:: python
 
+          dict_size = len(dataset.ids)
+          data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
+          fc = fluid.layers.embedding(input=data, size=[dict_size, 16])
     """
 
     helper = LayerHelper('embedding', **locals())
@@ -140,9 +191,7 @@ def dynamic_lstm(input,
                  gate_activation='sigmoid',
                  cell_activation='tanh',
                  candidate_activation='tanh',
-                 dtype='float32',
-                 main_program=None,
-                 startup_program=None):
+                 dtype='float32'):
     helper = LayerHelper('lstm', **locals())
     size = size / 4
     weight = helper.create_parameter(
@@ -185,9 +234,7 @@ def gru_unit(input,
              weight=None,
              bias=None,
              activation='tanh',
-             gate_activation='sigmoid',
-             main_program=None,
-             startup_program=None):
+             gate_activation='sigmoid'):
     """
     GRUUnit Operator implements partial calculations of the GRU unit as following:
 
@@ -223,6 +270,7 @@ def gru_unit(input,
             attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
     # create bias
+
     if bias is None:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
@@ -250,11 +298,7 @@ def gru_unit(input,
     return updated_hidden, reset_hidden_pre, gate
 
 
-def linear_chain_crf(input,
-                     label,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
+def linear_chain_crf(input, label, param_attr=None):
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
     transition = helper.create_parameter(
@@ -280,11 +324,7 @@ def linear_chain_crf(input,
     return log_likelihood
 
 
-def crf_decoding(input,
-                 param_attr,
-                 label=None,
-                 main_program=None,
-                 startup_program=None):
+def crf_decoding(input, param_attr, label=None):
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
     viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -319,7 +359,59 @@ def cos_sim(X, Y, **kwargs):
 
 def cross_entropy(input, label, **kwargs):
     """
-    This function computes cross_entropy using the input and label.
+    **Cross Entropy Layer**
+
+    This layer computes the cross entropy between `input` and `label`. It supports
+    both standard cross-entropy and soft-label cross-entropy loss computation.
+
+    1) One-hot cross-entropy:
+	`soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
+        
+        .. math::
+          
+            Y[i] = -\log(X[i, Label[i]])
+
+    2) Soft-label cross-entropy:
+	`soft_label = True`, `Label[i, j]` indicates the soft label of class j
+	for sample i:
+
+        .. math::
+
+            Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
+
+       Please make sure that in this case the summation of each row of `label`
+       equals one.
+
+    3) One-hot cross-entropy with vecterized `label`:
+	 As a special case of 2), when each row of 'label' has only one
+	 non-zero element which is equal to 1, soft-label cross-entropy degenerates
+         to a one-hot cross-entropy with one-hot label representation.
+    
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the 
+            batch size and D is the number of classes. This input is a probability 
+            computed by the previous operator, which is almost always the result
+            of a softmax operator.
+        label (Variable|list): the ground truth which is a 2-D tensor. When 
+              `soft_label` is set to `False`, `label` is a tensor<int64> with shape 
+              [N x 1]. When `soft_label` is set to `True`, `label` is a 
+              tensor<float/double> with shape [N x D].
+        soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate
+              the given labels as soft labels, default `False`.
+
+    Returns:
+         A 2-D tensor with shape [N x 1], the cross entropy loss.
+
+    Raises:
+        `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ 
+              `soft_label == True`, and the 2nd dimension of `input` and `label` are not \
+               equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
+
+    Examples:
+        .. code-block:: python
+
+          predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
     helper = LayerHelper('cross_entropy', **kwargs)
     out = helper.create_tmp_variable(dtype=input.dtype)
@@ -347,7 +439,8 @@ def square_error_cost(input, label, **kwargs):
 
     square_out = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
-        type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
+        type='square', inputs={'X': [minus_out]},
+        outputs={'Out': [square_out]})
     return square_out
 
 
@@ -392,8 +485,8 @@ def chunk_eval(input,
                excluded_chunk_types=None,
                **kwargs):
     """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
+    This function computes and outputs the precision, recall and
+    F1-score of chunk detection.
     """
     helper = LayerHelper("chunk_eval", **kwargs)
 
@@ -401,6 +494,9 @@ def chunk_eval(input,
     precision = helper.create_tmp_variable(dtype="float32")
     recall = helper.create_tmp_variable(dtype="float32")
     f1_score = helper.create_tmp_variable(dtype="float32")
+    num_infer_chunks = helper.create_tmp_variable(dtype="int64")
+    num_label_chunks = helper.create_tmp_variable(dtype="int64")
+    num_correct_chunks = helper.create_tmp_variable(dtype="int64")
 
     helper.append_op(
         type="chunk_eval",
@@ -409,14 +505,17 @@ def chunk_eval(input,
         outputs={
             "Precision": [precision],
             "Recall": [recall],
-            "F1-Score": [f1_score]
+            "F1-Score": [f1_score],
+            "NumInferChunks": [num_infer_chunks],
+            "NumLabelChunks": [num_label_chunks],
+            "NumCorrectChunks": [num_correct_chunks]
         },
         attrs={
             "num_chunk_types": num_chunk_types,
-            'chunk_scheme': chunk_scheme,
-            'excluded_chunk_types': excluded_chunk_types or []
+            "chunk_scheme": chunk_scheme,
+            "excluded_chunk_types": excluded_chunk_types or []
         })
-    return precision, recall, f1_score
+    return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks
 
 
 def sequence_conv(input,
@@ -426,9 +525,7 @@ def sequence_conv(input,
                   padding=None,
                   bias_attr=None,
                   param_attr=None,
-                  act=None,
-                  main_program=None,
-                  startup_program=None):
+                  act=None):
     """
     This function creates the op for sequence_conv, using the inputs and
     other convolutional configurations for the filters and stride as given
@@ -470,16 +567,83 @@ def conv2d(input,
            groups=None,
            param_attr=None,
            bias_attr=None,
-           act=None,
-           name=None,
-           main_program=None,
-           startup_program=None):
+           act=None):
     """
-    This function creates the op for a 2-dimensional Convolution.
-    This is performed using the parameters of filters(size, dimensionality etc)
-    , stride and other configurations for a Convolution operation.
-    This funciton can also append an activation on top of the
-    conv-2d output, if mentioned in the input parameters.
+    **Convlution2D Layer**
+
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output)
+    are in NCHW format. Where N is batch size, C is the number of channels, H is the height
+    of the feature, and W is the width of the feature.
+    The details of convolution layer, please refer UFLDL's `convolution,
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    If bias attribution and activation type are provided, bias is added to the output of the convolution,
+    and the corresponding activation function is applied to the final result.
+    For each input :math:`X`, the equation is:
+
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+        * :math:`X`: Input value, a tensor with NCHW format.
+        * :math:`W`: Filter value, a tensor with MCHW format.
+        * :math:`\\ast`: Convolution operation.
+        * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+        * :math:`\\sigma`: Activation function.
+        * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        Input:
+            Input shape: $(N, C_{in}, H_{in}, W_{in})$
+
+            Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+
+        Output:
+            Output shape: $(N, C_{out}, H_{out}, W_{out})$
+        Where
+    .. math::
+
+        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        groups(int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act(str): Activation type. Default: None
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
 
     if stride is None:
@@ -535,9 +699,53 @@ def conv2d(input,
 
 def sequence_pool(input, pool_type, **kwargs):
     """
-    This function add the operator for sequence pooling.
-    This is applied on top of the input using pool_type mentioned
-    in the parameters.
+    This function add the operator for sequence pooling. 
+    It pools features of all time-steps of each instance, and is applied 
+    on top of the input using pool_type mentioned in the parameters. 
+
+    It supports four pool_type:
+
+    - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
+    - sum:     :math:`Out[i] = \sum_jX_{ij}`
+    - sqrt:    :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
+    - max:     :math:`Out[i] = max(X_i)`
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+
+       for different pool_type:
+         average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+         sum    : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+         sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+         max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        pool_type (string): The pooling type of sequence_pool. 
+            It supports average, sum, sqrt and max.
+
+    Returns:
+        The sequence pooling variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1], 
+                              dtype='float32', lod_level=1)
+             avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
+             sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
+             sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
+             max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
     """
     helper = LayerHelper('sequence_pool', input=input, **kwargs)
     dtype = helper.input_dtype()
@@ -554,14 +762,78 @@ def sequence_pool(input, pool_type, **kwargs):
     return pool_out
 
 
+def sequence_first_step(input, **kwargs):
+    """
+    This funciton get the first step of sequence.
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+
+    Returns:
+        The sequence's first step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1], 
+                              dtype='float32', lod_level=1)
+             x_first_step = fluid.layers.sequence_first_step(input=x)
+    """
+    return sequence_pool(input=input, pool_type="first")
+
+
+def sequence_last_step(input, **kwargs):
+    """
+    This funciton get the last step of sequence.
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+
+    Returns:
+        The sequence's last step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1], 
+                              dtype='float32', lod_level=1)
+             x_last_step = fluid.layers.sequence_last_step(input=x)
+    """
+    return sequence_pool(input=input, pool_type="last")
+
+
 def pool2d(input,
            pool_size,
            pool_type,
            pool_stride=None,
            pool_padding=None,
-           global_pooling=False,
-           main_program=None,
-           startup_program=None):
+           global_pooling=False):
     """
     This function adds the operator for pooling in 2 dimensions, using the
     pooling configurations mentioned in input parameters.
@@ -607,9 +879,7 @@ def batch_norm(input,
                epsilon=1e-05,
                param_attr=None,
                bias_attr=None,
-               data_layout='NCHW',
-               main_program=None,
-               startup_program=None):
+               data_layout='NCHW'):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -679,7 +949,7 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
-def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+def beam_search_decode(ids, scores):
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
     sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -702,9 +972,8 @@ def conv2d_transpose(input,
                      filter_size=None,
                      padding=None,
                      stride=None,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
+                     dilation=None,
+                     param_attr=None):
     """
     The transpose of conv2d layer.
 
@@ -727,6 +996,9 @@ def conv2d_transpose(input,
         stride(int|tuple): The stride size. If stride is a tuple, it must
             contain two integers, (stride_H, stride_W). Otherwise, the
             stride_H = stride_W = stride.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation.
         param_attr: Parameter Attribute.
         main_program(Program): the main program
         startup_program(Program): the startup program
@@ -747,10 +1019,15 @@ def conv2d_transpose(input,
         op_attr['paddings'] = padding
 
     if isinstance(stride, int):
-        op_attr['strides'] = stride
+        op_attr['strides'] = [stride, stride]
     elif stride is not None:
         op_attr['strides'] = stride
 
+    if isinstance(dilation, int):
+        op_attr['dilations'] = [dilation, dilation]
+    elif dilation is not None:
+        op_attr['dilations'] = dilation
+
     if filter_size is None:
         if output_size is None:
             raise ValueError("output_size must be set when filter_size is None")
@@ -759,14 +1036,17 @@ def conv2d_transpose(input,
 
         padding = op_attr.get('paddings', [0, 0])
         stride = op_attr.get('strides', [1, 1])
+        dilation = op_attr.get('dilations', [1, 1])
 
         h_in = input.shape[2]
         w_in = input.shape[3]
-        filter_size_h = output_size[0] - \
-                        (h_in - 1) * stride[0] + 2 * padding[0]
-        filter_size_w = output_size[1] - \
-                        (w_in - 1) * stride[1] + 2 * padding[1]
+
+        filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) / dilation[0] + 1
+        filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) / dilation[1] + 1
         filter_size = [filter_size_h, filter_size_w]
+
     elif isinstance(filter_size, int):
         filter_size = [filter_size, filter_size]
 
@@ -783,3 +1063,352 @@ def conv2d_transpose(input,
         attrs=op_attr)
 
     return out
+
+
+def sequence_expand(x, y):
+    """Sequence Expand Layer. This layer will expand the input variable **x**
+    according to LoD information of **y**. And the following examples will
+    explain how sequence_expand works:
+
+    .. code-block:: text
+
+        * Case 1
+            x is a LoDTensor:
+                x.lod = [[0,       2, 3],
+                         [0, 1,    3, 4]]
+                x.data = [a, b, c, d]
+                x.dims = [4, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0,    2,    4],
+                         [0, 3, 6, 7, 8]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 2-level LoDTensor:
+                out.lod = [[0,                2,    4],
+                           [0,       3,       6, 7, 8]]
+                out.data = [a, a, a, b, b, b, c, d]
+                out.dims = [8, 1]
+
+        * Case 2
+            x is a Tensor:
+                x.data = [a, b, c]
+                x.dims = [3, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0, 2, 3, 6]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 1-level LoDTensor:
+                out.lod = [[0,    2, 3,      6]]
+                out.data = [a, a, b, c, c, c]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a LoDTensor.
+
+    Returns:
+        Variable: The expanded variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[10, 20],
+                             dtype='float32', lod_level=1)
+            out = layers.sequence_expand(x=x, y=y)
+    """
+    helper = LayerHelper('sequence_expand', input=x, **locals())
+    dtype = helper.input_dtype()
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sequence_expand', inputs={'X': x,
+                                        'Y': y}, outputs={'Out': tmp})
+    return tmp
+
+
+def lstm_unit(x_t,
+              hidden_t_prev,
+              cell_t_prev,
+              forget_bias=0.0,
+              param_attr=None,
+              bias_attr=None):
+    """Lstm unit layer. The equation of a lstm step is:
+
+        .. math::
+
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
+
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
+
+            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
+
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
+
+            h_t & = o_t tanh(c_t)
+
+    The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The implementation separates the linear transformation
+    and non-linear transformation apart. Here, we take :math:`i_t` as an
+    example. The linear transformation is applied by calling a `fc` layer and
+    the equation is:
+
+        .. math::
+
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i
+
+    The non-linear transformation is applied by calling `lstm_unit_op` and the
+    equation is:
+
+        .. math::
+
+            i_t = \sigma(L_{i_t})
+
+    This layer has two outputs including :math:`h_t` and :math:`o_t`.
+
+    Args:
+        x_t (Variable): The input value of current step.
+        hidden_t_prev (Variable): The hidden value of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit.
+        forget_bias (float): The forget bias of lstm unit.
+        param_attr (ParamAttr): The attributes of parameter weights, used to set
+            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights, if not False,
+            bias weights will be created and be set to default value.
+
+    Returns:
+        tuple: The hidden value and cell value of lstm unit.
+
+    Raises:
+        ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
+                not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \
+                and **cell_t_prev** not be the same.
+
+    Examples:
+
+        .. code-block:: python
+
+             x_t = fluid.layers.fc(input=x_t_data, size=10)
+             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
+             hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
+                                                    hidden_t_prev=prev_hidden,
+                                                    cell_t_prev=prev_cell)
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+
+    if len(x_t.shape) != 2:
+        raise ValueError("Rank of x_t must be 2.")
+
+    if len(hidden_t_prev.shape) != 2:
+        raise ValueError("Rank of hidden_t_prev must be 2.")
+
+    if len(cell_t_prev.shape) != 2:
+        raise ValueError("Rank of cell_t_prev must be 2.")
+
+    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
+            0] != cell_t_prev.shape[0]:
+        raise ValueError("The 1s dimension of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if bias_attr is None:
+        bias_attr = ParamAttr()
+
+    size = cell_t_prev.shape[1]
+    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
+    fc_out = fc(input=concat_out,
+                size=4 * size,
+                param_attr=param_attr,
+                bias_attr=bias_attr)
+    dtype = x_t.dtype
+    c = helper.create_tmp_variable(dtype)
+    h = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm_unit',
+        inputs={"X": fc_out,
+                "C_prev": cell_t_prev},
+        outputs={"C": c,
+                 "H": h},
+        attrs={"forget_bias": forget_bias})
+
+    return h, c
+
+
+def reduce_sum(input, dim=None, keep_dim=False):
+    """
+    Computes the sum of tensor elements over the given dimension. 
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the sum is performed. If 
+            :attr:`None`, sum all elements of :attr:`input` and return a 
+            Tensor variable with a single element, otherwise must be in the 
+            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, 
+            the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the 
+            output Tensor. The result tensor will have one fewer dimension 
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_sum(x)  # [3.5]
+            fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
+            fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
+            fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+    """
+    helper = LayerHelper('reduce_sum', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
+def reduce_mean(input, dim=None, keep_dim=False):
+    """
+    Computes the mean of tensor elements over the given dimension. 
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the mean is computed. If 
+            :attr:`None`, compute the mean over all elements of :attr:`input` 
+            and return a Tensor variable with a single element, otherwise 
+            must be in the range :math:`[-rank(input), rank(input))`. If 
+            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the 
+            output Tensor. The result tensor will have one fewer dimension 
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_mean(x)  # [0.4375]
+            fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
+            fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
+            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+    """
+    helper = LayerHelper('reduce_mean', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_mean',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
+def reduce_max(input, dim=None, keep_dim=False):
+    """
+    Computes the maximum of tensor elements over the given dimension. 
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the maximum is computed. 
+            If :attr:`None`, compute the maximum over all elements of 
+            :attr:`input` and return a Tensor variable with a single element, 
+            otherwise must be in the range :math:`[-rank(input), rank(input))`. 
+            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the 
+            output Tensor. The result tensor will have one fewer dimension 
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+    
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_max(x)  # [0.9]
+            fluid.layers.reduce_max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
+            fluid.layers.reduce_max(x, dim=-1)  # [0.9, 0.7]
+            fluid.layers.reduce_max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
+    """
+    helper = LayerHelper('reduce_max', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_max',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
+def reduce_min(input, dim=None, keep_dim=False):
+    """
+    Computes the minimum of tensor elements over the given dimension. 
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the minimum is computed. 
+            If :attr:`None`, compute the minimum over all elements of 
+            :attr:`input` and return a Tensor variable with a single element, 
+            otherwise must be in the range :math:`[-rank(input), rank(input))`. 
+            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the 
+            output Tensor. The result tensor will have one fewer dimension 
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+    
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_min(x)  # [0.1]
+            fluid.layers.reduce_min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
+            fluid.layers.reduce_min(x, dim=-1)  # [0.2, 0.1]
+            fluid.layers.reduce_min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
+    """
+    helper = LayerHelper('reduce_min', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_min',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index fa312ace60390e5fdd9637dccc71ccf8b247ca47..d2ff6841a317aaf6903edadc9213f69ef6c41216 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -2,7 +2,7 @@ from ..registry import register_layer
 __all__ = [
     'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose',
     'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
-    'elementwise_sub', 'elementwise_mul', 'clip', 'abs'
+    'elementwise_sub', 'elementwise_mul', 'clip', 'abs', 'sequence_softmax'
 ]
 
 for _OP in set(__all__):
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index a839ed897d7a9d4b238a8551b2255b87f207caee..e5820d24cd2b34ef53cbb91e2be66efc1b74d315 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -6,12 +6,12 @@ __all__ = [
 ]
 
 
-def create_tensor(dtype, name=None, main_program=None, startup_program=None):
+def create_tensor(dtype, name=None):
     helper = LayerHelper("create_tensor", **locals())
     return helper.create_variable(name=helper.name, dtype=dtype)
 
 
-def cast(x, dtype, main_program=None):
+def cast(x, dtype):
     """
     This function takes in the input with input_dtype
     and casts it to the output_dtype as the output.
@@ -27,10 +27,23 @@ def cast(x, dtype, main_program=None):
     return out
 
 
-def concat(input, axis, main_program=None, startup_program=None):
+def concat(input, axis=0):
     """
-    This function concats the input along the axis mentioned
+    **Concat**
+
+    This function concatenates the input along the axis mentioned
     and returns that as the output.
+
+    Args:
+        input(list): List of tensors to be concatenated
+        axis(int): Integer axis along which the tensors will be concatenated
+
+    Returns:
+        Variable: Output variable of the concatenation
+
+    Examples:
+        .. code-block:: python
+          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
     """
     helper = LayerHelper('concat', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -42,10 +55,29 @@ def concat(input, axis, main_program=None, startup_program=None):
     return out
 
 
-def sums(input, out=None, main_program=None, startup_program=None):
-    """
-    This function takes in the input and performs the sum operation on it
-    and returns that as the output.
+def sums(input, out=None):
+    """This function performs the sum operation on the input and returns the
+    result as the output.
+
+    Args:
+        input (Variable|list): The input tensor that has the elements
+                               that need to be summed up.
+
+    Returns:
+        Variable: The tensor type variable that has the sum of input
+                  written to it.
+
+    Examples:
+        .. code-block::python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          a0 = layers.array_read(array=tmp, i=i)
+          i = layers.increment(x=i)
+          a1 = layers.array_read(array=tmp, i=i)
+          mean_a0 = layers.mean(x=a0)
+          mean_a1 = layers.mean(x=a1)
+          a_sum = layers.sums(input=[mean_a0, mean_a1])
     """
     helper = LayerHelper('sum', **locals())
     if out is None:
@@ -54,7 +86,25 @@ def sums(input, out=None, main_program=None, startup_program=None):
     return out
 
 
-def assign(input, output, main_program=None, startup_program=None):
+def assign(input, output):
+    """
+    **Assign**
+
+    This function copies the *input* Variable to the *output* Variable.
+
+    Args:
+        input(Variable): The source variable
+        output(Variable): The destination variable
+
+    Returns:
+        Variable: The destination variable that was supplied as the *output*.
+
+    Examples:
+        .. code-block:: python
+          out = fluid.layers.create_tensor(dtype='float32')
+          hidden = fluid.layers.fc(input=data, size=10)
+          fluid.layers.assign(hidden, out)
+    """
     helper = LayerHelper('assign', **locals())
     helper.append_op(
         type='scale',
@@ -64,16 +114,28 @@ def assign(input, output, main_program=None, startup_program=None):
     return output
 
 
-def fill_constant(shape,
-                  dtype,
-                  value,
-                  out=None,
-                  main_program=None,
-                  startup_program=None):
+def fill_constant(shape, dtype, value, out=None):
     """
-    This function creates a tensor , with shape as mentioned in the input and
-    specified dtype and fills this up with a constant value that
-    comes in the input. It also sets the stop_gradient to be True.
+    **fill_constant**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with a constant supplied in *value*.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        out(Variable): Output Variable to initialize
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
     """
     helper = LayerHelper("fill_constant", **locals())
     if out is None:
@@ -94,9 +156,32 @@ def fill_constant_batch_size_like(input,
                                   dtype,
                                   value,
                                   input_dim_idx=0,
-                                  output_dim_idx=0,
-                                  main_program=None,
-                                  startup_program=None):
+                                  output_dim_idx=0):
+    """
+    **fill_constant_batch_size_like**
+
+    This function creates a tensor of specified *shape*, *dtype* and batch size,
+    and initializes this with a constant supplied in *value*. The batch size is
+    obtained from the `input` tensor.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        input(Variable): Tensor whose dimensions will be used to get batch size
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        input_dim_idx(int): Index of input's batch size dimension
+        output_dim_idx(int): Index of output's batch size dimension
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
+    """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_tmp_variable(dtype=dtype)
     helper.append_op(
@@ -114,7 +199,7 @@ def fill_constant_batch_size_like(input,
     return out
 
 
-def ones(shape, dtype, main_program=None):
+def ones(shape, dtype):
     """
     This function performs the same function as fill_constant() declared above
     with the constant value being 1.0.
@@ -122,7 +207,7 @@ def ones(shape, dtype, main_program=None):
     return fill_constant(value=1.0, **locals())
 
 
-def zeros(shape, dtype, main_program=None):
+def zeros(shape, dtype):
     """
     This function performs the same function as fill_constant() declared above
     with the constant value being 0.0.
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index 7ef524318e637604cc22ba9d8d7cafe1b7505261..54886a8f2cc63474fe82290c0a12771b4cbdba72 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -10,25 +10,19 @@ def simple_img_conv_pool(input,
                          pool_stride,
                          act,
                          param_attr=None,
-                         pool_type='max',
-                         main_program=None,
-                         startup_program=None):
+                         pool_type='max'):
     conv_out = layers.conv2d(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         param_attr=param_attr,
-        act=act,
-        main_program=main_program,
-        startup_program=startup_program)
+        act=act)
 
     pool_out = layers.pool2d(
         input=conv_out,
         pool_size=pool_size,
         pool_type=pool_type,
-        pool_stride=pool_stride,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=pool_stride)
     return pool_out
 
 
@@ -42,9 +36,7 @@ def img_conv_group(input,
                    conv_with_batchnorm=False,
                    conv_batchnorm_drop_rate=None,
                    pool_stride=1,
-                   pool_type=None,
-                   main_program=None,
-                   startup_program=None):
+                   pool_type=None):
     """
     Image Convolution Group, Used for vgg net.
     """
@@ -75,31 +67,19 @@ def img_conv_group(input,
             filter_size=conv_filter_size[i],
             padding=conv_padding[i],
             param_attr=param_attr[i],
-            act=local_conv_act,
-            main_program=main_program,
-            startup_program=startup_program)
+            act=local_conv_act)
 
         if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(
-                input=tmp,
-                act=conv_act,
-                main_program=main_program,
-                startup_program=startup_program)
+            tmp = layers.batch_norm(input=tmp, act=conv_act)
             drop_rate = conv_batchnorm_drop_rate[i]
             if abs(drop_rate) > 1e-5:
-                tmp = layers.dropout(
-                    x=tmp,
-                    dropout_prob=drop_rate,
-                    main_program=main_program,
-                    startup_program=startup_program)
+                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
 
     pool_out = layers.pool2d(
         input=tmp,
         pool_size=pool_size,
         pool_type=pool_type,
-        pool_stride=pool_stride,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=pool_stride)
     return pool_out
 
 
@@ -108,21 +88,13 @@ def sequence_conv_pool(input,
                        filter_size,
                        param_attr=None,
                        act="sigmoid",
-                       pool_type="max",
-                       main_program=None,
-                       startup_program=None):
+                       pool_type="max"):
     conv_out = layers.sequence_conv(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         param_attr=param_attr,
-        act=act,
-        main_program=main_program,
-        startup_program=startup_program)
+        act=act)
 
-    pool_out = layers.sequence_pool(
-        input=conv_out,
-        pool_type=pool_type,
-        main_program=main_program,
-        startup_program=startup_program)
+    pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
     return pool_out
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index bbdfab2df9519b77e5df184c00aadf703ec765e0..ff3e5315a2c2b115e4ba563f60de4139f248e93a 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,11 +1,12 @@
 from collections import defaultdict
 
 import framework
-from backward import append_backward_ops
-from framework import unique_name
+from backward import append_backward
+from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
+from clip import append_gradient_clip_ops
 
 __all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
 
@@ -159,34 +160,32 @@ class Optimizer(object):
 
         # Create any accumulators
         program = loss.block.program
-        self.helper = LayerHelper(
-            self.__class__.__name__,
-            main_program=program,
-            startup_program=startup_program)
-        self._create_accumulators(loss.block,
-                                  [p[0] for p in parameters_and_grads])
-
-        optimize_ops = []
-        for param_and_grad in parameters_and_grads:
-            if param_and_grad[0].trainable is True and param_and_grad[
-                    1] is not None:
-                optimize_op = self._append_optimize_op(loss.block,
-                                                       param_and_grad)
-                optimize_ops.append(optimize_op)
-
-        # Returned list of ops can include more ops in addition
-        # to optimization ops
-        return_ops = optimize_ops
-
-        # Get custom finish ops for subclasses
-        # FIXME: Need to fix this once we figure out how to handle dependencies
-        finish_ops = self._finish_update(loss.block)
-        if finish_ops is not None:
-            return_ops += finish_ops
-
-        if self._global_step is not None:
-            return_ops.append(self._increment_global_step(loss.block))
-        return return_ops
+        with program_guard(program, startup_program):
+            self.helper = LayerHelper(self.__class__.__name__)
+            self._create_accumulators(loss.block,
+                                      [p[0] for p in parameters_and_grads])
+
+            optimize_ops = []
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[0].trainable is True and param_and_grad[
+                        1] is not None:
+                    optimize_op = self._append_optimize_op(loss.block,
+                                                           param_and_grad)
+                    optimize_ops.append(optimize_op)
+
+            # Returned list of ops can include more ops in addition
+            # to optimization ops
+            return_ops = optimize_ops
+
+            # Get custom finish ops for subclasses
+            # FIXME: Need to fix this once we figure out how to handle dependencies
+            finish_ops = self._finish_update(loss.block)
+            if finish_ops is not None:
+                return_ops += finish_ops
+
+            if self._global_step is not None:
+                return_ops.append(self._increment_global_step(loss.block))
+            return return_ops
 
     def minimize(self,
                  loss,
@@ -195,16 +194,20 @@ class Optimizer(object):
                  no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.
 
-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+        params_grads = append_backward(loss, parameter_list, no_grad_set)
+
+        params_grads = append_gradient_clip_ops(params_grads)
+
         # Add regularization if any
         params_grads = append_regularization_ops(params_grads,
                                                  self.regularization)
+
         optimize_ops = self.create_optimization_pass(params_grads, loss,
                                                      startup_program)
-        return optimize_ops
+        return optimize_ops, params_grads
 
 
 class SGDOptimizer(Optimizer):
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
index 7952a5ea51c00f72664443fb26faa455e89da7be..ab4561b0423dd73c8c0d529cbf34b52876b1b77c 100644
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -1,6 +1,8 @@
 from initializer import Initializer, Xavier, Constant
 from regularizer import WeightDecayRegularizer
 
+__all__ = ['ParamAttr']
+
 
 class ParamAttr(object):
     def __init__(self,
@@ -8,12 +10,14 @@ class ParamAttr(object):
                  initializer=None,
                  learning_rate=1.0,
                  regularizer=None,
-                 trainable=True):
+                 trainable=True,
+                 clip=None):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
+        self.clip = clip
 
     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -54,9 +58,12 @@ class ParamAttr(object):
     def to_kwargs(self, with_initializer=False):
         kwargs = {
             'name': self.name,
-            'learning_rate': self.learning_rate,
+            'optimize_attr': {
+                'learning_rate': self.learning_rate
+            },
             'regularizer': self.regularizer,
-            'trainable': self.trainable
+            'trainable': self.trainable,
+            'clip_attr': self.clip
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index 2069b713faf41c5c00ceaf47e030864b98c678da..dcecd76224e70d03ed987a5bb104a977a527d218 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -1,5 +1,6 @@
 import paddle.v2.fluid.core as core
 from contextlib import contextmanager
+import os
 
 __all__ = ['CudaProfiler']
 
@@ -30,17 +31,21 @@ def cuda_profiler(output_file, output_mode=None, config=None):
             written into this file.
         output_mode (string) : The output mode has Key-Value pair format and
             Comma separated values format. It should be 'kvp' or 'csv'.
-        config (string) : The profiler options and counters can refer to
-            "Compute Command Line Profiler User Guide".
+        config (list of string) : The profiler options and counters can refer
+            to "Compute Command Line Profiler User Guide".
     """
     if output_mode is None:
         output_mode = 'csv'
     if output_mode not in ['kvp', 'csv']:
         raise ValueError("The output mode must be 'kvp' or 'csv'.")
     config = NVPROF_CONFIG if config is None else config
-    core.nvprof_init(output_file, output_mode, config)
+    config_file = 'nvprof_config_file'
+    with open(config_file, 'wb') as fp:
+        fp.writelines(["%s\n" % item for item in config])
+    core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
     yield
     # Disables profiler collection.
     core.nvprof_stop()
+    os.remove(config_file)
diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py
index 6f5dd365ded628ad49800f0a04f208ec49cca4c5..7aa82906114b355277185211134bb791e5dc43f9 100644
--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
@@ -8,7 +8,7 @@ import proto.framework_pb2 as framework_pb2
 from framework import OpProtoHolder, Variable, Program, Operator
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 
-__all__ = ['deprecated', 'register_layer']
+__all__ = ['deprecated', 'register_layer', 'autodoc']
 
 
 def _convert_(name):
@@ -175,12 +175,18 @@ def deprecated(func_or_class):
         """
         Wrap func with deprecated warning
         """
-        warnings.simplefilter('always', DeprecationWarning)  #turn off filter
+        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
         warnings.warn(
             "Call to deprecated function {}.".format(func.__name__),
             category=DeprecationWarning,
             stacklevel=2)
-        warnings.simplefilter('default', DeprecationWarning)  #reset filter
+        warnings.simplefilter('default', DeprecationWarning)  # reset filter
         return func(*args, **kwargs)
 
     return func_wrapper
+
+
+def autodoc(func):
+    func.__doc__ = _generate_doc_string_(OpProtoHolder.instance().get_op_proto(
+        func.__name__))
+    return func
diff --git a/python/paddle/v2/fluid/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore
index a648f2b387c2c7b9422eea6749e43e7b8871f60f..62f82151eb42342cd90657b1e4dfc93410950e62 100644
--- a/python/paddle/v2/fluid/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
@@ -1,3 +1,4 @@
 image/
 fit_a_line.model/
 tmp
+cuda_profiler.txt
diff --git a/python/paddle/v2/fluid/tests/__init__.py b/python/paddle/v2/fluid/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index d2693b602ea5de9d2d60fbe114820b25119bfa3f..c3591a613acafb268a5bd70618cd4555450bac29 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -150,7 +150,7 @@ def main():
     crf_decode = fluid.layers.crf_decoding(
         input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-    precision, recall, f1_score = fluid.layers.chunk_eval(
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
         input=crf_decode,
         label=target,
         chunk_scheme="IOB",
@@ -176,20 +176,21 @@ def main():
 
     batch_id = 0
     for pass_id in xrange(PASS_NUM):
+        chunk_evaluator.reset(exe)
         for data in train_data():
-            outs = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost, precision, recall, f1_score])
-            avg_cost_val = np.array(outs[0])
-            precision_val = np.array(outs[1])
-            recall_val = np.array(outs[2])
-            f1_score_val = np.array(outs[3])
+            cost, precision, recall, f1_score = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost] + chunk_evaluator.metrics)
+            pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                exe)
 
             if batch_id % 10 == 0:
-                print("avg_cost=" + str(avg_cost_val))
-                print("precision_val=" + str(precision_val))
-                print("recall_val:" + str(recall_val))
-                print("f1_score_val:" + str(f1_score_val))
+                print("avg_cost:" + str(cost) + " precision:" + str(
+                    precision) + " recall:" + str(recall) + " f1_score:" + str(
+                        f1_score) + " pass_precision:" + str(
+                            pass_precision) + " pass_recall:" + str(pass_recall)
+                      + " pass_f1_score:" + str(pass_f1_score))
 
             # exit early for CI
             exit(0)
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
index 80ffc5a544c201ed45a6de46b5a2addff82246b7..e79864b3977ed8111903f9497685ee7ebf76e1da 100644
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -33,7 +33,7 @@ def encoder_decoder():
 
     fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
     lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last")
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
 
     # decoder
     trg_language_word = layers.data(
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index 4dc2c50e1c963a189b727f0a7edcb6886abd9038..fc073f6be8563a363c0f98b9235ae267fa68562d 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -11,7 +11,9 @@ regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
 hidden1 = fluid.layers.fc(input=image,
                           size=128,
                           act='relu',
-                          param_attr=regularizer)
+                          param_attr=fluid.ParamAttr(
+                              regularizer=regularizer,
+                              clip=fluid.clip.ClipByValue(10)))
 hidden2 = fluid.layers.fc(input=hidden1,
                           size=64,
                           act='relu',
@@ -33,11 +35,10 @@ opts = optimizer.minimize(avg_cost)
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
 
 inference_program = fluid.default_main_program().clone()
-test_accuracy = fluid.evaluator.Accuracy(
-    input=predict, label=label, main_program=inference_program)
-test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-inference_program = fluid.io.get_inference_program(
-    test_target, main_program=inference_program)
+with fluid.program_guard(inference_program):
+    test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+    inference_program = fluid.io.get_inference_program(test_target)
 
 train_reader = paddle.batch(
     paddle.reader.shuffle(
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index db91ca4f9c7d17fb51fc5d65a0464e976d98523c..e3cc2a89371233014dec4ba3d730a866722d3eae 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -125,10 +125,11 @@ def model():
 
     # need cos sim
     inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
 
     label = layers.data(name='score', shape=[1], dtype='float32')
 
-    square_cost = layers.square_error_cost(input=inference, label=label)
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
 
     avg_cost = layers.mean(x=square_cost)
 
@@ -141,7 +142,7 @@ def main():
     opts = sgd_optimizer.minimize(cost)
 
     if USE_GPU:
-        place = core.GPUPlace(0)
+        place = core.CUDAPlace(0)
     else:
         place = core.CPUPlace()
 
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index c0b051f862f245b020a872b0a32fa4b560d1d574..633de66bea2af7404ab0d325b425e7b9e63d3e43 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -4,12 +4,7 @@ import paddle.v2.fluid as fluid
 from paddle.v2.fluid.layer_helper import LayerHelper
 
 
-def lstm(x,
-         c_pre_init,
-         hidden_dim,
-         forget_bias=None,
-         main_program=None,
-         startup_program=None):
+def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
     """
     This function helps create an operator for the LSTM (Long Short Term
     Memory) cell that can be used inside an RNN.
@@ -20,15 +15,8 @@ def lstm(x,
         c_pre = rnn.memory(init=c_pre_init)
         x_t = rnn.step_input(x)
 
-        before_fc = fluid.layers.concat(
-            input=[x_t, c_pre],
-            axis=1,
-            main_program=main_program,
-            startup_program=startup_program)
-        after_fc = fluid.layers.fc(input=before_fc,
-                                   size=hidden_dim * 4,
-                                   main_program=main_program,
-                                   startup_program=startup_program)
+        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
+        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)
 
         dtype = x.dtype
         c = helper.create_tmp_variable(dtype)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..20b4a8b34cd085ae51e6169f0d4eac58b7f3ffb2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
@@ -0,0 +1,80 @@
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    input=images,
+    filter_size=5,
+    num_filters=20,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    input=conv_pool_1,
+    filter_size=5,
+    num_filters=50,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+BATCH_SIZE = 50
+PASS_NUM = 3
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for spliting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+    exe.run(fluid.default_startup_program())
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        batch_id = 0
+        for data in train_reader():
+            loss, acc = exe.run(trainer_prog,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            if batch_id % 100 == 0:
+                print("batch_id %d, loss: %f, acc: %f" %
+                      (batch_id, loss, pass_acc))
+            batch_id += 1
+
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+else:
+    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index e83c4a0622013cbfebdf39434ef252412697acb1..b77d2b1268f27c5ec3c34839aaad9b75f0132c2e 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -4,7 +4,7 @@ import random
 import itertools
 import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.framework import Program, OpProtoHolder
@@ -90,12 +90,10 @@ def get_numeric_gradient(scope,
     def product(dim):
         return reduce(lambda a, b: a * b, dim, 1)
 
-    ctx = core.DeviceContext.create(core.CPUPlace())
-
     def get_output():
         sum = []
         for output_name in output_names:
-            op.run(scope, ctx)
+            op.run(scope, core.CPUPlace())
             sum.append(
                 np.array(scope.find_var(output_name).get_tensor()).mean())
         return np.array(sum).mean()
@@ -318,7 +316,7 @@ class OpTest(unittest.TestCase):
     def check_output(self, atol=1e-5):
         places = [core.CPUPlace()]
         if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
         for place in places:
             self.check_output_with_place(place, atol)
 
@@ -381,7 +379,7 @@ class OpTest(unittest.TestCase):
                                "Gradient Check On %s" % str(cpu_place))
 
         if core.is_compile_gpu() and self.op.support_gpu():
-            gpu_place = core.GPUPlace(0)
+            gpu_place = core.CUDAPlace(0)
             gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place,
                                                     output_names, no_grad_set)
 
@@ -493,7 +491,7 @@ class OpTest(unittest.TestCase):
             op_loss.desc.infer_var_type(block.desc)
             op_loss.desc.infer_shape(block.desc)
 
-        param_grad_list = append_backward_ops(
+        param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
 
         feed_dict = {
diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
index b052374dc7ec3c5684d6adfda6b9d000c5e19fe0..03eb7deb9a35933e5a1676a262a371c69151e6d1 100644
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
@@ -10,13 +10,13 @@ class TestExp(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': np.exp(self.inputs['X'])}
+        self.outputs = {'Out': np.exp(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestSigmoid(OpTest):
@@ -25,13 +25,13 @@ class TestSigmoid(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))}
+        self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
 class TestLogSigmoid(OpTest):
@@ -40,13 +40,13 @@ class TestLogSigmoid(OpTest):
         self.inputs = {
             'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
+        self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
 class TestTanh(OpTest):
@@ -55,13 +55,13 @@ class TestTanh(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': np.tanh(self.inputs['X'])}
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestTanhShrink(OpTest):
@@ -70,13 +70,13 @@ class TestTanhShrink(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32")
         }
-        self.outputs = {'Y': self.inputs['X'] - np.tanh(self.inputs['X'])}
+        self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
 class TestHardShrink(OpTest):
@@ -90,13 +90,13 @@ class TestHardShrink(OpTest):
 
         t = np.copy(x)
         t[(t >= -threshold) & (t <= threshold)] = 0
-        self.outputs = {'Y': t}
+        self.outputs = {'Out': t}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.005)
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
 
 
 class TestSoftShrink(OpTest):
@@ -110,13 +110,13 @@ class TestSoftShrink(OpTest):
         y = np.copy(self.inputs['X'])
         y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
             y - lambda_val)
-        self.outputs = {'Y': y}
+        self.outputs = {'Out': y}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestSqrt(OpTest):
@@ -125,13 +125,13 @@ class TestSqrt(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': np.sqrt(self.inputs['X'])}
+        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestAbs(OpTest):
@@ -144,13 +144,13 @@ class TestAbs(OpTest):
         # we should avoid this
         x[np.abs(x) < 0.005] = 0.02
         self.inputs = {'X': x}
-        self.outputs = {'Y': np.abs(self.inputs['X'])}
+        self.outputs = {'Out': np.abs(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestCeil(OpTest):
@@ -158,13 +158,13 @@ class TestCeil(OpTest):
         self.op_type = "ceil"
         x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
         self.inputs = {'X': x}
-        self.outputs = {'Y': np.ceil(self.inputs['X'])}
+        self.outputs = {'Out': np.ceil(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestFloor(OpTest):
@@ -173,13 +173,13 @@ class TestFloor(OpTest):
         x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
         self.inputs = {'X': x}
         # numpy floor need +1
-        self.outputs = {'Y': np.floor(self.inputs['X']) + 1.0}
+        self.outputs = {'Out': np.floor(self.inputs['X']) + 1.0}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestRound(OpTest):
@@ -187,13 +187,13 @@ class TestRound(OpTest):
         self.op_type = "round"
         x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
         self.inputs = {'X': x}
-        self.outputs = {'Y': np.round(self.inputs['X'])}
+        self.outputs = {'Out': np.round(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestRelu(OpTest):
@@ -203,13 +203,13 @@ class TestRelu(OpTest):
         # The same reason with TestAbs
         x[np.abs(x) < 0.005] = 0.02
         self.inputs = {'X': x}
-        self.outputs = {'Y': np.maximum(self.inputs['X'], 0)}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestBRelu(OpTest):
@@ -227,13 +227,13 @@ class TestBRelu(OpTest):
         t = np.copy(x)
         t[t < t_min] = t_min
         t[t > t_max] = t_max
-        self.outputs = {'Y': t}
+        self.outputs = {'Out': t}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
 
 class TestRelu6(OpTest):
@@ -248,14 +248,14 @@ class TestRelu6(OpTest):
         self.inputs = {'X': x}
         self.attrs = {'threshold': threshold}
         self.outputs = {
-            'Y': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
+            'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
 
 class TestSoftRelu(OpTest):
@@ -271,13 +271,13 @@ class TestSoftRelu(OpTest):
         t = np.copy(x)
         t[t < -threshold] = -threshold
         t[t > threshold] = threshold
-        self.outputs = {'Y': np.log((np.exp(t) + 1))}
+        self.outputs = {'Out': np.log((np.exp(t) + 1))}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
 
 class TestELU(OpTest):
@@ -290,27 +290,27 @@ class TestELU(OpTest):
         self.inputs = {'X': x}
         self.attrs = {'alpha': alpha}
         self.outputs = {
-            'Y': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+            'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
 
 class TestReciprocal(OpTest):
     def setUp(self):
         self.op_type = "reciprocal"
         self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
-        self.outputs = {'Y': np.reciprocal(self.inputs['X'])}
+        self.outputs = {'Out': np.reciprocal(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.01)
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
 
 class TestLog(OpTest):
@@ -319,13 +319,13 @@ class TestLog(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': np.log(self.inputs['X'])}
+        self.outputs = {'Out': np.log(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestSquare(OpTest):
@@ -334,13 +334,13 @@ class TestSquare(OpTest):
         self.inputs = {
             'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         }
-        self.outputs = {'Y': np.square(self.inputs['X'])}
+        self.outputs = {'Out': np.square(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestPow(OpTest):
@@ -348,13 +348,13 @@ class TestPow(OpTest):
         self.op_type = "pow"
         self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
         self.attrs = {'factor': 3.0}
-        self.outputs = {'Y': np.power(self.inputs['X'], 3)}
+        self.outputs = {'Out': np.power(self.inputs['X'], 3)}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
 
 class TestSTanh(OpTest):
@@ -366,13 +366,13 @@ class TestSTanh(OpTest):
         scale_a = 2.0 / 3.0
         scale_b = 1.7159
         self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
-        self.outputs = {'Y': scale_b * np.tanh(self.inputs['X'] * scale_a)}
+        self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestSoftplus(OpTest):
@@ -381,13 +381,13 @@ class TestSoftplus(OpTest):
         self.inputs = {
             'X': np.random.uniform(-1, 1, [11, 17]).astype("float64")
         }
-        self.outputs = {'Y': np.log(1 + np.exp(self.inputs['X']))}
+        self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestSoftsign(OpTest):
@@ -397,14 +397,14 @@ class TestSoftsign(OpTest):
             'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
         }
         self.outputs = {
-            'Y': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
+            'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
 class TestThresholdedRelu(OpTest):
@@ -419,13 +419,13 @@ class TestThresholdedRelu(OpTest):
 
         self.inputs = {'X': X}
         self.attrs = {'threshold': threshold}
-        self.outputs = {'Y': (X > threshold) * X}
+        self.outputs = {'Out': (X > threshold) * X}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=self.relative_error)
+        self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
 
 
 class TestHardSigmoid(OpTest):
@@ -447,13 +447,13 @@ class TestHardSigmoid(OpTest):
             upper_threshold - 0.2
 
         temp = X * slope + offset
-        self.outputs = {'Y': np.maximum(0.0, np.minimum(1.0, temp))}
+        self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.002)
+        self.check_grad(['X'], 'Out', max_relative_error=0.002)
 
 
 class TestSwish(OpTest):
@@ -462,13 +462,13 @@ class TestSwish(OpTest):
         X = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
         self.inputs = {'X': X}
         self.attrs = {'beta': 2.3}
-        self.outputs = {'Y': X * expit(self.attrs['beta'] * X)}
+        self.outputs = {'Out': X * expit(self.attrs['beta'] * X)}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
index 903e84c32887100bbeef6ebf81f66f06f084fab5..7b2d02fbf4256d2c27383a3452d526271af543a3 100644
--- a/python/paddle/v2/fluid/tests/test_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase):
             LearningRate='LearningRate',
             epsilon=2.0)
 
-        ctx = core.DeviceContext.create(place)
-        adagrad_op.run(scope, ctx)
+        adagrad_op.run(scope, place)
 
         # get and compare moment result
         moment_result_array = np.array(moment)
@@ -168,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase):
     def test_sparse_adagrad(self):
         places = [core.CPUPlace()]
         if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
         for place in places:
             self.check_with_place(place)
 
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
index f6120aedecf1015c279b8f218f5e37f2e598ab91..01321de8eac34d562d99726b1f4125d1932ab40f 100644
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy
 
@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
         total_sum = layers.sums(input=[a_sum, x_sum])
         total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
 
-        append_backward_ops(total_sum_scaled)
+        append_backward(total_sum_scaled)
 
         g_vars = map(default_main_program().global_block().var,
                      [each_x.name + "@GRAD" for each_x in x])
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index e766a68c0e338b07e47260e40edc544c98555382..abbd48d2b843cedb77caffc13413d2f9695defa6 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -3,10 +3,7 @@ import numpy as np
 from op_test import OpTest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-
-
-def grad_var_name(var_name):
-    return var_name + "@GRAD"
+from paddle.v2.fluid.framework import grad_var_name
 
 
 def get_backward_op(scope, op, no_grad_set):
@@ -211,7 +208,7 @@ class TestBatchNormOp(OpTest):
         print 'python: NHWC, NCHW, backward checking passed'
 
     def test_forward_backward(self):
-        def test_with_place(place, tensor_format, shape):
+        def test_with_place(place, data_layout, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
@@ -295,12 +292,11 @@ class TestBatchNormOp(OpTest):
                 SavedVariance="saved_variance",
                 # attrs
                 is_test=False,
-                tensor_format=tensor_format,
+                data_layout=data_layout,
                 momentum=momentum,
                 epsilon=epsilon)
 
-            ctx = core.DeviceContext.create(place)
-            batch_norm_op.run(scope, ctx)
+            batch_norm_op.run(scope, place)
 
             # check forward result
             self.__assert_close(y_tensor, y_out, "y_out")
@@ -308,13 +304,13 @@ class TestBatchNormOp(OpTest):
             self.__assert_close(saved_variance_tensor, saved_variance,
                                 "saved_variance")
             self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.GPUPlace):
+            if isinstance(place, core.CUDAPlace):
                 atol = 5e-2
             else:
                 atol = 1e-4
             self.__assert_close(variance_out_tensor, variance_out,
                                 "variance_out", atol)
-            print "op test forward passed: ", str(place), tensor_format
+            print "op test forward passed: ", str(place), data_layout
 
             # run backward
             batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
@@ -323,7 +319,7 @@ class TestBatchNormOp(OpTest):
                 ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
                 place,
                 feed_dict={"y_out": y_grad})
-            batch_norm_op_grad.run(scope, ctx)
+            batch_norm_op_grad.run(scope, place)
 
             x_grad_tensor = create_or_get_tensor(scope,
                                                  grad_var_name("x_val"), None,
@@ -339,11 +335,15 @@ class TestBatchNormOp(OpTest):
             self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
             self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
             self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), tensor_format
+            print "op test backward passed: ", str(place), data_layout
 
         places = [core.CPUPlace()]
         if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
+
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
                 test_with_place(place, data_format, [2, 3, 4, 5])
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
index 5fad7d8cce5af3677aa77dc0abb64f1ecd380419..f329214dce407fe0382c51b29f0f4c33b562541a 100644
--- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             SentenceIds="sentence_ids",
             SentenceScores="sentence_scores")
 
-        ctx = core.DeviceContext.create(self.cpu_place)
-        beam_search_decode_op.run(self.scope, ctx)
+        beam_search_decode_op.run(self.scope, self.cpu_place)
 
         expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
         self.assertEqual(sentence_ids.lod(), expected_lod)
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py
index cc7c09bb59de3f83e47b4d95c1203f7f050c5132..595f132fa85f0a65f15d9ac31ad320e567c96358 100644
--- a/python/paddle/v2/fluid/tests/test_beam_search_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py
@@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data):
 class BeamSearchOpTester(unittest.TestCase):
     def setUp(self):
         self.scope = core.Scope()
-        self.ctx = core.DeviceContext.create(core.CPUPlace())
         self._create_ids()
         self._create_scores()
         self._create_pre_ids()
@@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase):
             level=0,
             beam_size=2,
             end_id=0, )
-        op.run(self.scope, self.ctx)
+        op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         print 'selected_ids', np.array(selected_ids)
         print 'lod', selected_ids.lod()
diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
index 819e65a653437f0c34e14403f76317ff3b7f37f4..53bf6f815b8c7baf4c92d9fd488b69722ab0bef5 100644
--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
@@ -147,7 +147,13 @@ class TestChunkEvalOp(OpTest):
             'Recall': np.asarray(
                 [recall], dtype='float32'),
             'F1-Score': np.asarray(
-                [f1], dtype='float32')
+                [f1], dtype='float32'),
+            'NumInferChunks': np.asarray(
+                [self.num_infer_chunks], dtype='int64'),
+            'NumLabelChunks': np.asarray(
+                [self.num_label_chunks], dtype='int64'),
+            'NumCorrectChunks': np.asarray(
+                [self.num_correct_chunks], dtype='int64')
         }
 
     def setUp(self):
diff --git a/python/paddle/v2/fluid/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py
index 9d1df44b9065f8101e90b87815660f8c0818645f..32e54084e48cf77c569db4dee54a0c89d5108373 100644
--- a/python/paddle/v2/fluid/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase):
         self.create_global_variables()
         self.create_cond_op()
         self.create_sub_net()
-        ctx = core.DeviceContext.create(core.CPUPlace())
-        self.condop.run(self.scope, ctx)
+        self.condop.run(self.scope, core.CPUPlace())
         return np.array(self.scope.find_var("Out").get_tensor())
 
     def create_global_variables(self):
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
index 2b9d8f351a2836cd723d629d4790de1e068d0ea3..7d815123f3454d1457f59202219f9a93bf3d8c31 100644
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy
 
 
@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
         outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
         print outs
         loss = layers.mean(x=out)
-        append_backward_ops(loss=loss)
+        append_backward(loss=loss)
         outs = exe.run(
             feed={'X': x},
             fetch_list=[
diff --git a/python/paddle/v2/fluid/tests/test_const_value.py b/python/paddle/v2/fluid/tests/test_const_value.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8c17c2c98674fa67458efa090e166e37f5a6a8a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_const_value.py
@@ -0,0 +1,14 @@
+import unittest
+import paddle.v2.fluid.framework as framework
+
+
+class ConditionalBlock(unittest.TestCase):
+    def test_const_value(self):
+        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
+        self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
+        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
+        self.assertEqual(framework.ZERO_VAR_SUFFIX, "@ZERO")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
index d7b1f2f2a3abf6335998742dbbef8e17794170fa..d59537b924d57d40f7d740d99eb814c95f528e5f 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
@@ -3,14 +3,17 @@ import numpy as np
 from op_test import OpTest
 
 
-def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
+def conv2dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_h, in_w = input_.shape
     f_c, out_c, f_h, f_w = filter_.shape
     assert in_c == f_c
 
-    stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad']
-    out_h = (in_h - 1) * stride[0] + f_h
-    out_w = (in_w - 1) * stride[1] + f_w
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+    d_bolck_h = dilations[0] * (f_h - 1) + 1
+    d_bolck_w = dilations[1] * (f_w - 1) + 1
+    out_h = (in_h - 1) * stride[0] + d_bolck_h
+    out_w = (in_w - 1) * stride[1] + d_bolck_w
 
     out = np.zeros((in_n, out_c, out_h, out_w))
 
@@ -23,9 +26,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
 
                 for k in range(out_c):
                     tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
-                    i1, i2 = i * stride[0], i * stride[0] + f_h
-                    j1, j2 = j * stride[0], j * stride[0] + f_w
-                    out[n, k, i1:i2, j1:j2] += tmp_out
+                    i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
+                    j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
+                    out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out
 
     out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
@@ -37,11 +40,8 @@ class TestConv2dTransposeOp(OpTest):
         self.init_op_type()
         self.init_test_case()
 
-        conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
         input_ = np.random.random(self.input_size).astype("float32")
         filter_ = np.random.random(self.filter_size).astype("float32")
-        output = conv2dtranspose_forward_naive(
-            input_, filter_, conv2dtranspose_param).astype('float32')
 
         self.inputs = {'Input': input_, 'Filter': filter_}
         self.attrs = {
@@ -49,6 +49,10 @@ class TestConv2dTransposeOp(OpTest):
             'paddings': self.pad,
             'dilations': self.dilations
         }
+
+        output = conv2dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype('float32')
+
         self.outputs = {'Output': output}
 
     def test_check_output(self):
@@ -104,11 +108,60 @@ class TestWithStride(TestConv2dTransposeOp):
         self.filter_size = [f_c, 6, 3, 3]
 
 
+class TestWithDilation(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
 # ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
     def init_op_type(self):
         self.op_type = "conv2d_transpose_cudnn"
 
 
+class TestCudnnWithPad(TestWithPad):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose_cudnn"
+
+
+class TestCudnnWithStride(TestWithStride):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose_cudnn"
+
+
+# #cudnn v5 does not support dilation conv.
+# class TestCudnnWithDilation(TestWithDilation):
+#     def init_test_case(self):
+#         self.pad = [1, 1]
+#         self.stride = [2, 2]
+#         self.dilations = [2, 2]
+#         self.input_size = [2, 3, 5, 5]  # NCHW
+#         f_c = self.input_size[1]
+#         self.filter_size = [f_c, 6, 3, 3]
+#
+#     def init_op_type(self):
+#         self.op_type = "conv2d_transpose_cudnn"
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
index 8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56..a353f9b4d40233de46237005138f21430f4d865a 100644
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
@@ -3,15 +3,20 @@ import numpy as np
 from op_test import OpTest
 
 
-def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
+def conv3dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_d, in_h, in_w = input_.shape
     f_c, out_c, f_d, f_h, f_w = filter_.shape
     assert in_c == f_c
 
-    stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad']
-    out_d = (in_d - 1) * stride[0] + f_d
-    out_h = (in_h - 1) * stride[1] + f_h
-    out_w = (in_w - 1) * stride[2] + f_w
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+
+    d_bolck_d = dilations[0] * (f_d - 1) + 1
+    d_bolck_h = dilations[1] * (f_h - 1) + 1
+    d_bolck_w = dilations[2] * (f_w - 1) + 1
+    out_d = (in_d - 1) * stride[0] + d_bolck_d
+    out_h = (in_h - 1) * stride[1] + d_bolck_h
+    out_w = (in_w - 1) * stride[2] + d_bolck_w
     out = np.zeros((in_n, out_c, out_d, out_h, out_w))
 
     for n in range(in_n):
@@ -25,10 +30,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
                     for k in range(out_c):
                         tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
                                          axis=0)
-                        d1, d2 = d * stride[0], d * stride[0] + f_d
-                        i1, i2 = i * stride[1], i * stride[1] + f_h
-                        j1, j2 = j * stride[2], j * stride[2] + f_w
-                        out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
+                        d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
+                        i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
+                        j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
+                        out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:
+                            dilations[2]] += tmp_out
 
     out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
               pad[2]]
@@ -41,18 +47,19 @@ class TestConv3dTransposeOp(OpTest):
         self.init_op_type()
         self.init_test_case()
 
-        conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
         input_ = np.random.random(self.input_size).astype("float32")
         filter_ = np.random.random(self.filter_size).astype("float32")
-        output = conv3dtranspose_forward_naive(
-            input_, filter_, conv3dtranspose_param).astype("float32")
 
         self.inputs = {'Input': input_, 'Filter': filter_}
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
-            # 'dilations': self.dilations
+            'dilations': self.dilations
         }
+
+        output = conv3dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype("float32")
+
         self.outputs = {'Output': output}
 
     def test_check_output(self):
@@ -108,11 +115,60 @@ class TestWithStride(TestConv3dTransposeOp):
         self.filter_size = [f_c, 6, 3, 3, 3]
 
 
+class TestWithDilation(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [2, 2, 2]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
 # ------------ test_cudnn ------------
 class TestCudnn(TestConv3dTransposeOp):
     def init_op_type(self):
         self.op_type = "conv3d_transpose_cudnn"
 
 
+class TestCudnnWithPad(TestWithPad):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"
+
+
+class TestCudnnWithStride(TestWithStride):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"
+
+
+# #cudnn v5 does not support dilation conv.
+# class TestCudnnWithDilation(TestWithDilation):
+#     def init_test_case(self):
+#         self.pad = [1, 1, 1]
+#         self.stride = [2, 2, 2]
+#         self.dilations = [2, 2, 2]
+#         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+#         f_c = self.input_size[1]
+#         self.filter_size = [f_c, 6, 3, 3, 3]
+#
+#     def init_op_type(self):
+#         self.op_type = "conv3d_transpose_cudnn"
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
index 4f5ea836b44102e5599a2302efd669291ebe920b..2483200212686caf9c46f9c1129b5d8ffdcc9145 100644
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -47,7 +47,9 @@ class TestDropoutOp4(OpTest):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
         self.attrs = {'dropout_prob': 0.35, 'is_test': True}
-        self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -58,7 +60,9 @@ class TestDropoutOp5(OpTest):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
         self.attrs = {'dropout_prob': 0.75, 'is_test': True}
-        self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
index 034266c26f48197872a3419135d45b30a8120e8a..8090c5f47814c60034f2f46f00e56c530e0f2c19 100644
--- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
@@ -63,8 +63,7 @@ class TestDynRNN(unittest.TestCase):
 
             all_timesteps = fluid.layers.array_to_lod_tensor(
                 x=out, table=rank_table)
-            last = fluid.layers.sequence_pool(
-                input=all_timesteps, pool_type='last')
+            last = fluid.layers.sequence_last_step(input=all_timesteps)
             logits = fluid.layers.fc(input=last, size=1, act=None)
             loss = fluid.layers.sigmoid_cross_entropy_with_logits(
                 x=logits, label=label)
@@ -101,7 +100,7 @@ class TestDynRNN(unittest.TestCase):
                 rnn.update_memory(mem, out_)
                 rnn.output(out_)
 
-            last = fluid.layers.sequence_pool(input=rnn(), pool_type='last')
+            last = fluid.layers.sequence_last_step(input=rnn())
             logits = fluid.layers.fc(input=last, size=1, act=None)
             label = fluid.layers.data(name='label', shape=[1], dtype='float32')
             loss = fluid.layers.sigmoid_cross_entropy_with_logits(
diff --git a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
index eff8fa87d9c0dafc6935604101e94ee6c8b081ce..cd91769a22f8d6af193efabd8d997913676fbba6 100644
--- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
@@ -7,7 +7,7 @@ class TestFillZerosLikeOp(OpTest):
     def setUp(self):
         self.op_type = "fill_zeros_like"
         self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
-        self.outputs = {'Y': np.zeros_like(self.inputs["X"])}
+        self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index 627ab4e23562f14538d85f2e21edeb7d72d940bb..6f6a60ccb3ff17f6a12eec6974b8b2d73885c29f 100644
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -1,32 +1,46 @@
 import unittest
+import numpy
+
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-import numpy
+from paddle.v2.fluid.executor import Executor
 
 
 class TestGaussianRandomOp(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "gaussian_random"
+        self.inputs = {}
+        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+
+        self.outputs = ["Out"]
+
     def test_cpu(self):
-        self.gaussian_random_test(place=core.CPUPlace())
+        self.gaussian_random_test(place=fluid.CPUPlace())
 
     def test_gpu(self):
         if core.is_compile_gpu():
-            self.gaussian_random_test(place=core.GPUPlace(0))
+            self.gaussian_random_test(place=fluid.CUDAPlace(0))
 
     def gaussian_random_test(self, place):
-        scope = core.Scope()
-        scope.var('Out').get_tensor()
-
-        op = Operator(
-            "gaussian_random",
-            Out='Out',
-            shape=[1000, 784],
-            mean=.0,
-            std=1.,
-            seed=10)
-
-        context = core.DeviceContext.create(place)
-        op.run(scope, context)
-        tensor = numpy.array(scope.find_var('Out').get_tensor())
+
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+        tensor = outs[0]
+
         self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
         self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
 
diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
index 2fd609d4474e97ecd96adcd146f2f550e0772740..b621d1525e33693869e24e2bb233bc8e257b077f 100644
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -5,12 +5,7 @@ import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
 
 
-def conv_block(input,
-               num_filter,
-               groups,
-               dropouts,
-               main_program=None,
-               startup_program=None):
+def conv_block(input, num_filter, groups, dropouts):
     return nets.img_conv_group(
         input=input,
         pool_size=2,
@@ -20,90 +15,54 @@ def conv_block(input,
         conv_act='relu',
         conv_with_batchnorm=True,
         conv_batchnorm_drop_rate=dropouts,
-        pool_type='max',
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_type='max')
 
 
 class TestLayer(unittest.TestCase):
     def test_batch_norm_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program)
-        hidden1 = fluid.layers.batch_norm(
-            input=images,
-            main_program=main_program,
-            startup_program=startup_program)
-        hidden2 = fluid.layers.fc(input=hidden1,
-                                  size=128,
-                                  act='relu',
-                                  main_program=main_program)
-        hidden3 = fluid.layers.batch_norm(
-            input=hidden2,
-            main_program=main_program,
-            startup_program=startup_program)
+        with fluid.program_guard(main_program, startup_program):
+            images = fluid.layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            hidden1 = fluid.layers.batch_norm(input=images)
+            hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
+            fluid.layers.batch_norm(input=hidden2)
 
         print str(main_program)
 
     def test_dropout_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program)
-        fluid.layers.dropout(
-            x=images,
-            dropout_prob=0.5,
-            main_program=main_program,
-            startup_program=startup_program)
+        with fluid.program_guard(main_program, startup_program):
+            images = fluid.layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.dropout(x=images, dropout_prob=0.5)
 
-        # print str(main_program)
+        print str(main_program)
 
     def test_img_conv_group(self):
         main_program = Program()
         startup_program = Program()
 
-        images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
-                           startup_program)
-        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program,
-                           startup_program)
+        with fluid.program_guard(main_program, startup_program):
+            images = fluid.layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            conv1 = conv_block(images, 64, 2, [0.3, 0])
+            conv_block(conv1, 256, 3, [0.4, 0.4, 0])
 
-        # print str(main_program)
+        print str(main_program)
 
     def test_elementwise_add_with_act(self):
         main_program = Program()
         startup_program = Program()
-        image1 = fluid.layers.data(
-            name='pixel1',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        image2 = fluid.layers.data(
-            name='pixel2',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        out = fluid.layers.elementwise_add(
-            x=image1,
-            y=image2,
-            act='relu',
-            main_program=main_program,
-            startup_program=startup_program)
-        # print(main_program)
+        with fluid.program_guard(main_program, startup_program):
+            image1 = fluid.layers.data(
+                name='pixel1', shape=[3, 48, 48], dtype='float32')
+            image2 = fluid.layers.data(
+                name='pixel2', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.elementwise_add(x=image1, y=image2, act='relu')
+        print(main_program)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py
index 60aed62ead83dedbeb9438c431ec292558d88ce5..71ca3e6c105c4437470f8e9f596e723d879b65e4 100644
--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -6,7 +6,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.executor as executor
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.io import save_inference_model, load_inference_model
 
 
@@ -16,35 +16,18 @@ class TestBook(unittest.TestCase):
 
         init_program = Program()
         program = Program()
-        x = layers.data(
-            name='x',
-            shape=[2],
-            dtype='float32',
-            main_program=program,
-            startup_program=init_program)
-        y = layers.data(
-            name='y',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            startup_program=init_program)
-
-        y_predict = layers.fc(input=x,
-                              size=1,
-                              act=None,
-                              main_program=program,
-                              startup_program=init_program)
-
-        cost = layers.square_error_cost(
-            input=y_predict,
-            label=y,
-            main_program=program,
-            startup_program=init_program)
-        avg_cost = layers.mean(
-            x=cost, main_program=program, startup_program=init_program)
-
-        sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-        sgd_optimizer.minimize(avg_cost, init_program)
+
+        with program_guard(program, init_program):
+            x = layers.data(name='x', shape=[2], dtype='float32')
+            y = layers.data(name='y', shape=[1], dtype='float32')
+
+            y_predict = layers.fc(input=x, size=1, act=None)
+
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+
+            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost, init_program)
 
         place = core.CPUPlace()
         exe = executor.Executor(place)
diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py
index ed6e3fe24f6333c9c90d760787eb13241a7e1868..0a4dd0f4faf370161e5695d97f0ed4bf73b6ec26 100644
--- a/python/paddle/v2/fluid/tests/test_is_empty_op.py
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase):
 
     def one_case(self, input, target):
         op = Operator(type="is_empty", X=input, Out="out")
-        ctx = core.DeviceContext.create(core.CPUPlace())
-        op.run(self.scope, ctx)
+        op.run(self.scope, core.CPUPlace())
         out = self.scope.var("out").get_tensor()
         self.assertEqual(np.array(out)[0], target)
 
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 9b88080158139f267e253c598e60a4d92a0eff68..9d2dcca56dd1361b9e2448be9f1d5403f8ee17e3 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,6 +161,41 @@ class TestBook(unittest.TestCase):
                     x=dat, label=lbl))
         print(str(program))
 
+    def test_sequence_expand(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=1)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+        print(str(program))
+
+    def test_lstm_unit(self):
+        program = Program()
+        with program_guard(program):
+            x_t_data = layers.data(
+                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t = layers.fc(input=x_t_data, size=10)
+            prev_hidden_data = layers.data(
+                name='prev_hidden_data', shape=[10, 20], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+            prev_cell_data = layers.data(
+                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell = layers.fc(input=prev_cell_data, size=30)
+            self.assertIsNotNone(
+                layers.lstm_unit(
+                    x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+        print(str(program))
+
+    def test_sequence_softmax(self):
+        program = Program()
+        with program_guard(program):
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq = layers.fc(input=seq_data, size=20)
+            self.assertIsNotNone(layers.sequence_softmax(x=seq))
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
index 0a916a55bc3d097e17fb504b0d6b2f2818f030c9..c552cb033f1ec8f5843490083edee7b2762b5703 100644
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -2,9 +2,9 @@ import unittest
 import paddle.v2.fluid.core as core
 import numpy
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -118,16 +118,17 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
     def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
         place = self.place()
         program = Program()
-        x = layers.data(name='x', shape=[10], main_program=program)
-        x.persistable = True
-        table = layers.lod_rank_table(x, level=level, main_program=program)
-        max_len = layers.max_sequence_len(table, main_program=program)
-        max_len.persistable = True
-        array = layers.lod_tensor_to_array(x, table, main_program=program)
-        array.persistable = True
-
-        result = layers.array_to_lod_tensor(array, table, main_program=program)
-        result.persistable = True
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10])
+            x.persistable = True
+            table = layers.lod_rank_table(x, level=level)
+            max_len = layers.max_sequence_len(table)
+            max_len.persistable = True
+            array = layers.lod_tensor_to_array(x, table)
+            array.persistable = True
+
+            result = layers.array_to_lod_tensor(array, table)
+            result.persistable = True
         exe = Executor(place)
         scope = core.Scope()
         exe.run(program, feed={'x': tensor}, scope=scope)
@@ -160,19 +161,16 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
         place = core.CPUPlace()
         program = Program()
 
-        x = layers.data(
-            name='x',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            stop_gradient=False)
-        table = layers.lod_rank_table(x, level=0, main_program=program)
-        array = layers.lod_tensor_to_array(x, table, main_program=program)
-        result = layers.array_to_lod_tensor(array, table, main_program=program)
+        with program_guard(program):
+            x = layers.data(
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            table = layers.lod_rank_table(x, level=0)
+            array = layers.lod_tensor_to_array(x, table)
+            result = layers.array_to_lod_tensor(array, table)
 
-        mean = layers.mean(x=result, main_program=program)
+            mean = layers.mean(x=result)
 
-        append_backward_ops(mean)
+            append_backward(mean)
 
         tensor = core.LoDTensor()
         tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
index 50fcc4a72ddbd6d7a3d3b73434c6ac8de5a006e2..33558c6105442b169b2e26abc7f39e15b7fe7322 100644
--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
@@ -1,5 +1,5 @@
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard, default_main_program, default_startup_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.optimizer import MomentumOptimizer
 import paddle.v2.fluid.core as core
@@ -10,44 +10,42 @@ import numpy as np
 
 class TestMNISTIfElseOp(unittest.TestCase):
     def test_raw_api(self):
-        kwargs = {'startup_program': Program(), 'main_program': Program()}
-        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')
 
-        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+            label = layers.data(name='y', shape=[1], dtype='int64')
 
-        limit = layers.fill_constant_batch_size_like(
-            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
+            limit = layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            true_image, false_image = layers.split_lod_tensor(
+                input=image, mask=cond)
 
-        cond = layers.less_than(x=label, y=limit, **kwargs)
-        true_image, false_image = layers.split_lod_tensor(
-            input=image, mask=cond, **kwargs)
+            true_out = layers.create_tensor(dtype='float32')
+            true_cond = layers.ConditionalBlock([true_image])
 
-        true_out = layers.create_tensor(dtype='float32', **kwargs)
-        true_cond = layers.ConditionalBlock([true_image], **kwargs)
+            with true_cond.block():
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=true_out)
 
-        with true_cond.block():
-            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            layers.assign(input=prob, output=true_out, **kwargs)
+            false_out = layers.create_tensor(dtype='float32')
+            false_cond = layers.ConditionalBlock([false_image])
 
-        false_out = layers.create_tensor(dtype='float32', **kwargs)
-        false_cond = layers.ConditionalBlock([false_image], **kwargs)
+            with false_cond.block():
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=false_out)
 
-        with false_cond.block():
-            hidden = layers.fc(input=false_image,
-                               size=200,
-                               act='tanh',
-                               **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            layers.assign(input=prob, output=false_out, **kwargs)
+            prob = layers.merge_lod_tensor(
+                in_true=true_out, in_false=false_out, mask=cond, x=image)
+            loss = layers.cross_entropy(input=prob, label=label)
+            avg_loss = layers.mean(x=loss)
 
-        prob = layers.merge_lod_tensor(
-            in_true=true_out, in_false=false_out, mask=cond, x=image, **kwargs)
-        loss = layers.cross_entropy(input=prob, label=label, **kwargs)
-        avg_loss = layers.mean(x=loss, **kwargs)
-
-        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-        optimizer.minimize(avg_loss, kwargs['startup_program'])
+            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+            optimizer.minimize(avg_loss, startup_prog)
 
         train_reader = paddle.batch(
             paddle.reader.shuffle(
@@ -57,7 +55,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
         place = core.CPUPlace()
         exe = Executor(place)
 
-        exe.run(kwargs['startup_program'])
+        exe.run(startup_prog)
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for data in train_reader():
@@ -65,7 +63,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
                 y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                 y_data = np.expand_dims(y_data, axis=1)
 
-                outs = exe.run(kwargs['main_program'],
+                outs = exe.run(prog,
                                feed={'x': x_data,
                                      'y': y_data},
                                fetch_list=[avg_loss])
@@ -75,39 +73,36 @@ class TestMNISTIfElseOp(unittest.TestCase):
         self.assertFalse(True)
 
     def test_ifelse(self):
-        kwargs = {'startup_program': Program(), 'main_program': Program()}
-        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
-
-        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
-
-        limit = layers.fill_constant_batch_size_like(
-            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
-
-        cond = layers.less_than(x=label, y=limit, **kwargs)
-
-        ie = layers.IfElse(cond, **kwargs)
-
-        with ie.true_block():
-            true_image = ie.input(image)
-            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            ie.output(prob)
-
-        with ie.false_block():
-            false_image = ie.input(image)
-            hidden = layers.fc(input=false_image,
-                               size=200,
-                               act='tanh',
-                               **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            ie.output(prob)
-
-        prob = ie()
-        loss = layers.cross_entropy(input=prob[0], label=label, **kwargs)
-        avg_loss = layers.mean(x=loss, **kwargs)
-
-        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-        optimizer.minimize(avg_loss, kwargs['startup_program'])
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')
+
+            label = layers.data(name='y', shape=[1], dtype='int64')
+
+            limit = layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            ie = layers.IfElse(cond)
+
+            with ie.true_block():
+                true_image = ie.input(image)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            with ie.false_block():
+                false_image = ie.input(image)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            prob = ie()
+            loss = layers.cross_entropy(input=prob[0], label=label)
+            avg_loss = layers.mean(x=loss)
+
+            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+            optimizer.minimize(avg_loss, startup_prog)
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=8192),
@@ -135,4 +130,5 @@ class TestMNISTIfElseOp(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    unittest.main()
+    # temp disable if else unittest since it could be buggy.
+    exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_net.py b/python/paddle/v2/fluid/tests/test_net.py
index 318df08a9e73ac95cab73c34182bc6220ef6c681..d9fe55a8af5c750c5c926e875ddbb645f8abb1a0 100644
--- a/python/paddle/v2/fluid/tests/test_net.py
+++ b/python/paddle/v2/fluid/tests/test_net.py
@@ -7,7 +7,7 @@ def fc(X, W, Y):
     ret_v = core.Net.create()
 
     ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.append_op(Operator("sigmoid", X="pre_activation", Y=Y))
+    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
     ret_v.complete_add_op(True)
     return ret_v
 
@@ -30,7 +30,7 @@ Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}
     Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
         Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
             Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
-            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Y[fc.out]}.
+            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
 '''
         self.assertEqual(expected, "\n" + str(net))
 
diff --git a/python/paddle/v2/fluid/tests/test_operator.py b/python/paddle/v2/fluid/tests/test_operator.py
index 4aa022ef90159cd96eed4e4dbe30cf5d1e8a41a7..c059a2b88b1324935f871b6e9c11efd5652ddd65 100644
--- a/python/paddle/v2/fluid/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
@@ -1,6 +1,6 @@
 import unittest
+
 import paddle.v2.fluid.op as op
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
index 2459dfd664300d405edb36c4ca906c1769b5e7d2..1eadb7d912629024ee21e30b0a5fa4910bb96e06 100644
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -2,7 +2,7 @@ import unittest
 
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestOptimizer(unittest.TestCase):
@@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase):
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-        opts = sgd_optimizer.minimize(mean_out, init_program)
+        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
@@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase):
         learning_rate = 0.01
         sgd_optimizer = optimizer.SGDOptimizer(
             learning_rate=learning_rate, global_step=global_step)
-        opts = sgd_optimizer.minimize(mean_out, init_program)
+        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
         self.assertEqual(len(opts), 2)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adagrad_optimizer = self.MockAdagrad(
             learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adam_optimizer = self.MockAdam(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         learning_rate = 0.01
         decayed_adagrad_optimizer = self.MockDecayedAdagrad(
             learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
         opts = decayed_adagrad_optimizer.create_optimization_pass(
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
index 395d0dc36a3d1d6fbfebb4cdf34395c4edee412d..e3f3ac58ef9b30864849770510f7339749dab84f 100644
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -3,6 +3,7 @@ import numpy as np
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.profiler as profiler
 import paddle.v2.fluid.layers as layers
+import os
 
 
 class TestProfiler(unittest.TestCase):
@@ -14,14 +15,16 @@ class TestProfiler(unittest.TestCase):
         data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
         conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
 
-        place = fluid.GPUPlace(0)
+        place = fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
 
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+        output_file = 'cuda_profiler.txt'
+        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
             for i in range(epoc):
                 input = np.random.random(dshape).astype('float32')
                 exe.run(fluid.default_main_program(), feed={'data': input})
+        os.remove(output_file)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
index 1a9313c68aab165d85ae29051faeacb4927ac2c9..447c746aacc1c9455d7a023bca625d548ab2638b 100644
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,7 +1,7 @@
 from __future__ import print_function
 import unittest
 
-from paddle.v2.fluid.framework import Program, default_main_program
+from paddle.v2.fluid.framework import Program, default_main_program, program_guard, grad_var_name
 import paddle.v2.fluid.layers as layers
 
 main_program = default_main_program()
@@ -109,12 +109,10 @@ class TestProgram(unittest.TestCase):
         self.assertEqual(add_op.idx, 1)
         param_to_grad = prog.append_backward(mean_out, set())
 
-        def grad_name(name):
-            return name + "@GRAD"
-
         for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
                          "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
+            self.assertEqual(param_to_grad[var_name][0],
+                             grad_var_name(var_name))
             self.assertEqual(param_to_grad[var_name][1], 0)
 
         expect_ops = [
@@ -129,13 +127,10 @@ class TestProgram(unittest.TestCase):
     def test_program_clone_with_parameter(self):
         main_program = Program()
         startup_program = Program()
-        kwargs = {
-            'main_program': main_program,
-            'startup_program': startup_program
-        }
-        d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
-        hidden = layers.fc(input=d, size=100, **kwargs)
-        layers.fc(input=hidden, size=100, **kwargs)
+        with program_guard(main_program, startup_program):
+            d = layers.data(name='x', shape=[784], dtype='float32')
+            hidden = layers.fc(input=d, size=100)
+            layers.fc(input=hidden, size=100)
 
         new_program = main_program.clone()
         self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
index 694ff0d8dd794111aff51bb7d503a56b87514342..84f4e36fa7312fbcb96cc66ff26e234c3016df30 100644
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -1,9 +1,9 @@
 import unittest
 
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, grad_var_name
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core
 
@@ -164,7 +164,7 @@ class RecurrentOpTest1(unittest.TestCase):
             for x in self.data_field
         }
         fetch_list = [
-            self.main_program.global_block().var(x + "@GRAD")
+            self.main_program.global_block().var(grad_var_name(x))
             for x in self.data_field
         ]
 
@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
     def test_backward(self):
         self.check_forward()
 
-        append_backward_ops(self.output)
+        append_backward(self.output)
 
         ana_grad = [np.array(x) for x in self.backward()]
 
diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
index 24baf55e90c98f39bab926e8c85a791eee5ed4a4..890c881a126a32344128652691c6cad45e02e82d 100644
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.regularizer as regularizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestL2DecayRegularizer(unittest.TestCase):
@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c136f6360ce73a7c532b5486e544796e6853bcb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
@@ -0,0 +1,47 @@
+import unittest
+import paddle.v2.fluid as fluid
+import numpy
+
+
+class TestReorderLoDTensor(unittest.TestCase):
+    def test_reorder(self):
+        dat = fluid.layers.data(name='input', shape=[1], lod_level=2)
+        dat.stop_gradient = False
+        rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1)
+        table = fluid.layers.lod_rank_table(rank_dat)
+        new_dat = fluid.layers.reorder_lod_tensor_by_rank(
+            x=dat, rank_table=table)
+        loss = fluid.layers.mean(x=new_dat)
+        fluid.backward.append_backward(loss=loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(fluid.default_startup_program())
+
+        ref = fluid.Tensor()
+        ref_lod = [0, 3, 4, 7, 8, 14]
+        ref.set_lod([ref_lod])
+
+        ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu)
+        input = fluid.Tensor()
+        lod_level_0 = numpy.random.randint(low=1, high=5, size=5)
+        lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist()
+        lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1])
+        lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist()
+
+        input.set_lod([lod_level_0, lod_level_1])
+        input.set(
+            numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'),
+            cpu)
+
+        ig = exe.run(fluid.default_main_program(),
+                     feed={'input': input,
+                           'ref': ref},
+                     fetch_list=['input@GRAD'],
+                     return_numpy=False)[0]
+        self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001)
+        self.assertEqual(input.lod(), ig.lod())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py
index 16bb6bb2af67f7d32a2fafc1cb37412084ec0829..18ee3aece656276fec9671df9baf298b7fd3c9b1 100644
--- a/python/paddle/v2/fluid/tests/test_reshape_op.py
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
@@ -17,5 +17,19 @@ class TestReshapeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
+class TestReshapeOpDimInfer(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [4, -1, 5]}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index 9999165ed509aa40f31f26aa676f381561bd0016..d1bb20f37a3785f70bee072b9df282bba4012c16 100644
--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -2,7 +2,7 @@ import unittest
 
 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core
 
diff --git a/python/paddle/v2/fluid/tests/test_seq_expand.py b/python/paddle/v2/fluid/tests/test_sequence_expand.py
similarity index 89%
rename from python/paddle/v2/fluid/tests/test_seq_expand.py
rename to python/paddle/v2/fluid/tests/test_sequence_expand.py
index ff17edd04bfd34ab8449a0ae05aacf66632dabc8..0f22612d3dbe483e4d5a8638636e44e172160156 100644
--- a/python/paddle/v2/fluid/tests/test_seq_expand.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_expand.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestSeqExpand(OpTest):
+class TestSequenceExpand(OpTest):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
         y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
@@ -21,7 +21,7 @@ class TestSeqExpand(OpTest):
         self.outputs = {'Out': out}
 
     def setUp(self):
-        self.op_type = 'seq_expand'
+        self.op_type = 'sequence_expand'
         self.set_data()
         self.compute()
 
@@ -32,7 +32,7 @@ class TestSeqExpand(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestSeqExpandCase1(TestSeqExpand):
+class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
         x_lod = [[0, 2, 5]]
@@ -41,7 +41,7 @@ class TestSeqExpandCase1(TestSeqExpand):
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
-class TestSeqExpandCase2(TestSeqExpand):
+class TestSequenceExpandCase2(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
         x_lod = [[0, 1]]
@@ -50,7 +50,7 @@ class TestSeqExpandCase2(TestSeqExpand):
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
-class TestSeqExpandCase3(TestSeqExpand):
+class TestSequenceExpandCase3(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
         x_lod = [[0, 1, 2, 3, 4]]
diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
index ca05a381f06cfd40b7939dbda8d4f1f4aacd0271..14d41e172a22c677235ab3fa997ef6f0b6e39778 100644
--- a/python/paddle/v2/fluid/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase):
             Grad='Grad',
             ParamOut='Param',
             LearningRate='LearningRate')
-        ctx = core.DeviceContext.create(place)
-        sgd_op.run(scope, ctx)
+        sgd_op.run(scope, place)
 
         # get and compare result
         result_array = np.array(param)
@@ -79,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase):
     def test_sparse_sgd(self):
         places = [core.CPUPlace()]
         if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
         for place in places:
             self.check_with_place(place)
 
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 86db4c64b493d94cc675ed4bcee7e2925fef1977..be1588fc2d09fa58882425eb3d080ef1560ebc79 100644
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy
 
@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
         self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
 
         mem3_mean = layers.mean(x=mem3)
-        append_backward_ops(loss=mem3_mean)
+        append_backward(loss=mem3_mean)
         x_grad = exe.run(
             feed={'x': tensor},
             fetch_list=[main_program.global_block().var('x@GRAD')])[0]
diff --git a/python/paddle/v2/fluid/tests/test_softmax_op.py b/python/paddle/v2/fluid/tests/test_softmax_op.py
index b41c810d9a6269c934a434b085748a86deccb475..136fc0283afd6acf1de4baae5e681789662295ce 100644
--- a/python/paddle/v2/fluid/tests/test_softmax_op.py
+++ b/python/paddle/v2/fluid/tests/test_softmax_op.py
@@ -17,14 +17,14 @@ class TestSoftmaxOp(OpTest):
             'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
         }
         self.outputs = {
-            'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(['X'], 'Out')
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
index f5da4e408f0a83dbf6da530b478e91bbf9cd5ab2..2e4defd55d75c2012f39bea30a6c4de12528e77c 100644
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -2,9 +2,9 @@ import unittest
 import paddle.v2.fluid.core as core
 import numpy as np
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -75,26 +75,22 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
              level=0):
         place = self.place()
         program = Program()
-        x = layers.data(name='x', shape=[1], main_program=program)
-        x.persistable = True
+        with program_guard(program):
+            x = layers.data(name='x', shape=[1])
+            x.persistable = True
 
-        y = layers.data(name='y', shape=[1], main_program=program)
-        y.persistable = True
+            y = layers.data(name='y', shape=[1])
+            y.persistable = True
 
-        out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level, main_program=program)
-        out_true.persistable = True
-        out_false.persistable = True
+            out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+            out_true.persistable = True
+            out_false.persistable = True
 
-        out = layers.merge_lod_tensor(
-            in_true=out_true,
-            in_false=out_false,
-            mask=y,
-            x=x,
-            level=level,
-            main_program=program)
+            out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
 
-        out.persistable = True
+            out.persistable = True
 
         exe = Executor(place)
         scope = core.Scope()
@@ -123,34 +119,21 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
     def test_grad(self):
         place = core.CPUPlace()
         program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
 
-        x = layers.data(
-            name='x',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            stop_gradient=False)
-        y = layers.data(
-            name='y',
-            shape=[1],
-            dtype='bool',
-            main_program=program,
-            stop_gradient=False)
-
-        level = 0
-
-        out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level, main_program=program)
-        out = layers.merge_lod_tensor(
-            in_true=out_true,
-            in_false=out_false,
-            mask=y,
-            x=x,
-            level=level,
-            main_program=program)
-        mean = layers.mean(x=out, main_program=program)
-
-        append_backward_ops(mean)
+            level = 0
+
+            out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+            out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+            mean = layers.mean(x=out)
+
+            append_backward(mean)
 
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
diff --git a/python/paddle/v2/fluid/tests/test_spp_op.py b/python/paddle/v2/fluid/tests/test_spp_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..007723f0e35ad194c427401337bc9b13756576de
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
@@ -0,0 +1,68 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_pool2d_op import max_pool2D_forward_naive
+from test_pool2d_op import avg_pool2D_forward_naive
+
+
+class TestSppOp(OpTest):
+    def setUp(self):
+        self.op_type = "spp"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = input.shape
+        out_level_flatten = []
+        for i in xrange(self.pyramid_height):
+            bins = np.power(2, i)
+            kernel_size = [0, 0]
+            padding = [0, 0]
+            kernel_size[0] = np.ceil(hsize /
+                                     bins.astype("double")).astype("int32")
+            padding[0] = (
+                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
+
+            kernel_size[1] = np.ceil(wsize /
+                                     bins.astype("double")).astype("int32")
+            padding[1] = (
+                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
+            out_level = self.pool2D_forward_naive(input, kernel_size,
+                                                  kernel_size, padding)
+            out_level_flatten.append(
+                out_level.reshape(nsize, bins * bins * csize))
+            if i == 0:
+                output = out_level_flatten[i]
+            else:
+                output = np.concatenate((output, out_level_flatten[i]), 1)
+        # output = np.concatenate(out_level_flatten.tolist(), 0);
+        self.inputs = {'X': input.astype('float32'), }
+        self.attrs = {
+            'pyramid_height': self.pyramid_height,
+            'pooling_type': self.pool_type
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "avg":
+            self.check_grad(['X'], 'Out', max_relative_error=0.05)
+
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.pool_type = "max"
+
+
+class TestCase2(TestSppOp):
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.pool_type = "avg"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
index f736dfb2e85552b321403c961da517f3b3efb100..dbe4d6bcd069d2088b3cc1b4efd575d14afd4198 100644
--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -1,32 +1,49 @@
 import unittest
+import numpy
+
 from paddle.v2.fluid.op import Operator
 import paddle.v2.fluid.core as core
-import numpy
+import paddle.v2.fluid as fluid
 
 
 class TestUniformRandomOp(unittest.TestCase):
-    def test_uniform_random_cpu(self):
+    def setUp(self):
+        self.op_type = "uniform_random"
+        self.inputs = {}
+        self.attrs = {
+            "shape": [1000, 784],
+            "min": -5.0,
+            "max": 10.0,
+            "seed": 10
+        }
+        self.outputs = ["Out"]
+
+    def test_cpu(self):
         self.uniform_random_test(place=core.CPUPlace())
 
-    def test_uniform_random_gpu(self):
+    def test_gpu(self):
         if core.is_compile_gpu():
-            self.uniform_random_test(place=core.GPUPlace(0))
+            self.uniform_random_test(place=core.CUDAPlace(0))
 
     def uniform_random_test(self, place):
-        scope = core.Scope()
-        scope.var('X').get_tensor()
-
-        op = Operator(
-            "uniform_random",
-            Out='X',
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
-
-        ctx = core.DeviceContext.create(place)
-        op.run(scope, ctx)
-        tensor = numpy.array(scope.find_var('X').get_tensor())
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = fluid.Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+
+        tensor = outs[0]
+
         self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
 
 
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
index 033b03a4957131e1155c61e8ed2f10eefb23fda4..7c5593cc5e5a66d4ccb237e3706ff3e544adf033 100644
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy
 
 
@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
         sum_result = layers.array_read(array=mem_array, i=i)
         loss = layers.mean(x=sum_result)
 
-        append_backward_ops(loss)
+        append_backward(loss)
 
         cpu = core.CPUPlace()
         exe = Executor(cpu)
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index 7e457f987d36e52e9e7c7727b4f996ad31c6bf08..44a6e344630bb35d28ee29078bf8727053a24bef 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,7 +14,7 @@
 
 __all__ = [
     'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
 ]
 
 from threading import Thread
@@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"):
     return lines[:-1], lines[-1]
 
 
-def pipe_reader(left_cmd,
-                parser,
-                bufsize=8192,
-                file_type="plain",
-                cut_lines=True,
-                line_break="\n"):
+class PipeReader:
     """
-    pipe_reader read data by stream from a command, take it's 
-    stdout into a pipe buffer and redirect it to the parser to
-    parse, then yield data as your desired format.
+        PipeReader read data by stream from a command, take it's 
+        stdout into a pipe buffer and redirect it to the parser to
+        parse, then yield data as your desired format.
 
-    You can using standard linux command or call another program
-    to read data, from HDFS, Ceph, URL, AWS S3 etc:
+        You can using standard linux command or call another program
+        to read data, from HDFS, Ceph, URL, AWS S3 etc:
 
-    cmd = "hadoop fs -cat /path/to/some/file"
-    cmd = "cat sample_file.tar.gz"
-    cmd = "curl http://someurl"
-    cmd = "python print_s3_bucket.py"
+        .. code-block:: python
+           cmd = "hadoop fs -cat /path/to/some/file"
+           cmd = "cat sample_file.tar.gz"
+           cmd = "curl http://someurl"
+           cmd = "python print_s3_bucket.py"
 
-    A sample parser:
+        An example:
+
+        .. code-block:: python
     
-    def sample_parser(lines):
-        # parse each line as one sample data,
-        # return a list of samples as batches.
-        ret = []
-        for l in lines:
-            ret.append(l.split(" ")[1:5])
-        return ret
-
-    :param left_cmd: command to excute to get stdout from.
-    :type left_cmd: string
-    :param parser: parser function to parse lines of data.
-                   if cut_lines is True, parser will receive list
-                   of lines.
-                   if cut_lines is False, parser will receive a
-                   raw buffer each time.
-                   parser should return a list of parsed values.
-    :type parser: callable
-    :param bufsize: the buffer size used for the stdout pipe.
-    :type bufsize: int
-    :param file_type: can be plain/gzip, stream buffer data type.
-    :type file_type: string
-    :param cut_lines: whether to pass lines instead of raw buffer
-                      to the parser
-    :type cut_lines: bool
-    :param line_break: line break of the file, like \n or \r
-    :type line_break: string
-
-    :return: the reader generator.
-    :rtype: callable
+           def example_reader():
+               for f in myfiles:
+                   pr = PipeReader("cat %s"%f)
+                   for l in pr.get_line():
+                       sample = l.split(" ")
+                       yield sample
     """
-    if not isinstance(left_cmd, str):
-        raise TypeError("left_cmd must be a string")
-    if not callable(parser):
-        raise TypeError("parser must be a callable object")
-
-    process = subprocess.Popen(
-        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
-    # TODO(typhoonzero): add a thread to read stderr
-
-    # Always init a decompress object is better than
-    # create in the loop.
-    dec = zlib.decompressobj(
-        32 + zlib.MAX_WBITS)  # offset 32 to skip the header
 
-    def reader():
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
         remained = ""
         while True:
-            buff = process.stdout.read(bufsize)
+            buff = self.process.stdout.read(self.bufsize)
             if buff:
-                if file_type == "gzip":
-                    decomp_buff = dec.decompress(buff)
-                elif file_type == "plain":
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
                     decomp_buff = buff
                 else:
-                    raise TypeError("file_type %s is not allowed" % file_type)
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
 
                 if cut_lines:
                     lines, remained = _buf2lines(''.join(
                         [remained, decomp_buff]), line_break)
-                    parsed_list = parser(lines)
-                    for ret in parsed_list:
-                        yield ret
+                    for line in lines:
+                        yield line
                 else:
-                    for ret in parser(decomp_buff):
-                        yield ret
+                    yield decomp_buff
             else:
                 break
-
-    return reader
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
index 5a92951b100fa51ab6df7039d9c6b54d1f9d963e..4ba71969dffe7447b6c5b70aeb752a4e5469fb36 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -145,5 +145,33 @@ class TestXmap(unittest.TestCase):
                             self.assertEqual(e, mapper(idx))
 
 
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def example_reader(myfiles):
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            result = []
+            for r in example_reader([temp.name]):
+                result.append(r)
+
+            for idx, e in enumerate(records):
+                self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 8396fb44cfcee28211b5d3db7684a4adce1fb1f6..66ccfe808763d0e157f866ce08868e3fdebdea79 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -79,8 +79,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 
 # the prefix is sys.prefix which should always be usr
 paddle_bin_dir = 'opt/paddle/bin'
-paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage',
-               '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
                '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
                '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
                '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']