diff --git a/.clang-format b/.clang-format index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644 --- a/.clang-format +++ b/.clang-format @@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false ... - diff --git a/.copyright.hook b/.copyright.hook new file mode 100644 index 0000000000000000000000000000000000000000..de97ce90ac4a80d8d0532ea8898385b2e5781ce0 --- /dev/null +++ b/.copyright.hook @@ -0,0 +1,106 @@ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import io, re +import sys, os +import subprocess +import platform + +COPYRIGHT = ''' + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +LANG_COMMENT_MARK = None + +NEW_LINE_MARK = None + +COPYRIGHT_HEADER = None + +if platform.system() == "Windows": + NEW_LINE_MARK = "\r\n" +else: + NEW_LINE_MARK = '\n' + COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1] + p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0) + process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE) + date, err = process.communicate() + date = date.decode("utf-8").rstrip("\n") + COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date) + + +def generate_copyright(template, lang='C'): + if lang == 'Python': + LANG_COMMENT_MARK = '#' + else: + LANG_COMMENT_MARK = "//" + + lines = template.split(NEW_LINE_MARK) + ans = LANG_COMMENT_MARK + COPYRIGHT_HEADER + NEW_LINE_MARK + for lino, line in enumerate(lines): + if lino == 0 or lino == 1 or lino == len(lines) - 1: continue + ans += LANG_COMMENT_MARK + line + NEW_LINE_MARK + + return ans + + +def lang_type(filename): + if filename.endswith(".py"): + return "Python" + elif filename.endswith(".h"): + return "C" + elif filename.endswith(".hpp"): + return "C" + elif filename.endswith(".cc"): + return "C" + elif filename.endswith(".cpp"): + return "C" + elif filename.endswith(".cu"): + return "C" + elif filename.endswith(".cuh"): + return "C" + elif filename.endswith(".go"): + return "C" + elif filename.endswith(".proto"): + return "C" + else: + print("Unsupported filetype") + exit(0) + + +def main(argv=None): + parser = argparse.ArgumentParser( + description='Checker for copyright declaration.') + parser.add_argument('filenames', nargs='*', help='Filenames to check') + args = parser.parse_args(argv) + + retv = 0 + for filename in args.filenames: + first_line = io.open(filename).readline() + if "COPYRIGHT" in first_line.upper() : continue + original_contents = io.open(filename).read() + new_contents = generate_copyright( + COPYRIGHT, lang_type(filename)) + original_contents + print('Auto Insert Copyright Header {}'.format(filename)) + retv = 1 + with io.open(filename, 'w') as output_file: + output_file.write(new_contents) + + return retv + + +if __name__ == '__main__': + exit(main()) diff --git a/.gitignore b/.gitignore index 
020d3f0c303f7d850f4ec9c0efe58ab2d57dce2e..ac56a3320ec85769d2c87c072512f5217eca0c24 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ cmake_install.cmake paddle/.timestamp python/paddlepaddle.egg-info/ paddle/pybind/pybind.h +python/paddle/version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 59661c9c1da53a2ddac0127ed1827fedde811a1d..89c620bb2f7ef634fa80b64eec7037e8cb9a190c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,3 +31,11 @@ - id: go-fmt types: - go +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python ./.copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ + exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/.travis.yml b/.travis.yml index c51e02eb79a9e53a2b8d1d663e8f0c3e0d8c3a61..e2d49daa1981396628efa5d16459eb70e9e76884 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: script: - | timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi; - | if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; diff --git a/CMakeLists.txt b/CMakeLists.txt index 65164b8472b902be8b0b9d5fb99807d012b8a666..00996cb7ed5cc573c42b69be6db369c3654d6d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,10 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) project(paddle CXX C Go) +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) @@ -54,7 +58,9 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) +option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -67,9 +73,6 @@ if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # TODO: support glog for Android api 16 ~ 19 in the future - message(WARNING "Using the unofficial git repository instead") endif() endif() @@ -83,6 +86,8 @@ if(ANDROID OR IOS) "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_GOLANG OFF CACHE STRING + "Disable golang when cross-compiling for Android and iOS" FORCE) # Compile PaddlePaddle mobile inference library if (NOT WITH_C_API) @@ -133,6 +138,8 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/nccl) +include(external/cares) +include(external/grpc) include(cudnn) # set cudnn libraries, must before configure include(configure) # add paddle env configuration @@ -194,6 +201,10 @@ if(WITH_GOLANG) 
endif(WITH_GOLANG) set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") + +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/Dockerfile b/Dockerfile index 150344a8116e2be9b5bab8e5fdcc9c37f4025020..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update && \ automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools && \ + net-tools libtool && \ apt-get clean -y # Install Go and glide diff --git a/README.md b/README.md index db0fbd88b250cdc2a3cc77521cc1c2cea77c6e87..d06375a444dd65675bdd75baccf8445c1638a87c 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -36,7 +36,8 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl examples: - Optimized math operations through SSE/AVX intrinsics, BLAS libraries - (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. + (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels. + - Optimized CNN networks through MKL-DNN library. - Highly optimized recurrent networks which can handle **variable-length** sequence without padding. - Optimized local and distributed training for models with high dimensional @@ -61,32 +62,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl ## Installation It is recommended to check out the -[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html) before looking into the -[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html). +[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html). ## Documentation -We provide [English](http://doc.paddlepaddle.org/develop/doc/) and -[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation. 
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and +[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation. -- [Deep Learning 101](http://book.paddlepaddle.org/index.html) +- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) +- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) +- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html) You can also run distributed training jobs on Kubernetes clusters. -- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html) +- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html) We appreciate your contributions! diff --git a/RELEASE.cn.md b/RELEASE.cn.md index 5deaf230a8f5dd3089993f0fc79b9460fd049750..494c59730dd3c2830514e8924aa3d59a34ac412e 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,62 @@ +# v0.11.0版本 + +## PaddlePaddle Fluid + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像PyTorch和TensorFlow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + ## 新特点 + +* 发布 `PaddlePaddle Fluid`。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制。 +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment)。 +* 基于Bazel API利用cmake实现了一个新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库。
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogLeNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + +## 改进 + +* 使用一个Python`whl`包即可安装。 +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector时AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. + + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 146f7afa7dfbc152500b82fde28445ae3155c16c..5a62c955131007c9f3329d162c20d1b462550019 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,75 @@ +# Release v0.11.0 + +## PaddlePaddle Fluid + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept of a *model*, and applications + do not include a symbolic description of a graph of operators or a sequence + of layers. Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control flow, `if-then-else` or `for`. Instead, Fluid provides its + C++ implementations and their Python bindings using the `with` statement. For example: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provide a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter.
In future versions, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, take an application like the one above and output +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `PaddlePaddle Fluid`. +* Add C-API for model inference. +* Use the Fluid API to create a simple GAN demo. +* Add a developer guide about performance tuning. +* Add retry when downloading `paddle.v2.dataset`. +* Link protobuf-lite instead of protobuf in C++ to reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style of cmake functions for Paddle, based on the Bazel API. +* Automatically download and compile the Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when building with `WITH_MKL=ON`. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogLeNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv). +* Add mobile-friendly webpages. + +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluation in the V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove the buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. +* Enable the v2 API to use cuDNN batch normalization automatically. +* The BN layer's parameters can be shared via a fixed parameter name. +* Support variable-dimension input features for the 2D convolution operation. +* Refine the CUDA-related cmake to automatically detect the GPU architecture. +* Improve website navigation. + +## Bug Fixes + +* Fix a bug in ROI pooling. cc9a761 +* Fix AUC being zero when the label is a dense vector. #5274 +* Fix a bug in the WarpCTC layer.
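The `with`-based control flow described in the Fluid notes above can be made concrete with a short sketch. The following is a schematic condensation of the linked `test_while_op.py`, assuming the 0.11.0-era `paddle.v2.fluid` layer API; the `limit` bound is illustrative:

```python
import paddle.v2.fluid.layers as layers

# Fluid control flow: the while loop is a C++ operator exposed in Python
# through a `with` block, not a native Python loop.
i = layers.zeros(shape=[1], dtype='int64')
limit = layers.fill_constant(shape=[1], dtype='int64', value=10)
cond = layers.less_than(x=i, y=limit)

while_op = layers.While(cond=cond)
with while_op.block():
    # ... the loop body appends operators to the program here ...
    i = layers.increment(x=i, in_place=True)
    layers.less_than(x=i, y=limit, cond=cond)  # refresh the loop condition
```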
# Release v0.10.0 We are glad to release version 0.10.0. In this version, we are happy to release the new diff --git a/adversarial/README.md b/adversarial/README.md new file mode 100644 index 0000000000000000000000000000000000000000..51da21918a9d6e2192a2e03eabef4fde97896bc5 --- /dev/null +++ b/adversarial/README.md @@ -0,0 +1,9 @@ +# Advbox + +Advbox is a Python toolbox to create adversarial examples that fool neural networks. It requires Python and PaddlePaddle. + +## How to use + +1. Train a model and save its parameters (like fluid_mnist.py). +2. Load the parameters trained in step 1, then reconstruct the model (like mnist_tutorial_fgsm.py). +3. Use advbox to generate adversarial samples. diff --git a/adversarial/advbox/__init__.py b/adversarial/advbox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f56f14f18dafdfe1e712cea178a63f09a087b587 --- /dev/null +++ b/adversarial/advbox/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + A set of tools for generating adversarial examples on the PaddlePaddle platform +""" diff --git a/adversarial/advbox/attacks/base.py b/adversarial/advbox/attacks/base.py new file mode 100644 index 0000000000000000000000000000000000000000..000baa48f626c7dddce49502d20c499f6424cd06 --- /dev/null +++ b/adversarial/advbox/attacks/base.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +The base class for attacks. +""" +from abc import ABCMeta, abstractmethod + + +class Attack(object): + """ + Abstract base class for adversarial attacks. `Attack` represents an adversarial attack + which searches for an adversarial example. Subclasses should implement the _apply() method. + + Args: + model(Model): an instance of the class advbox.base.Model. + + """ + __metaclass__ = ABCMeta + + def __init__(self, model): + self.model = model + + def __call__(self, image_label): + """ + Generate the adversarial sample. + + Args: + image_label(list): The image and label tuple list with one element. + """ + adv_img = self._apply(image_label) + return adv_img + + @abstractmethod + def _apply(self, image_label): + """ + Search for an adversarial example. + + Args: + image_label(list): The image and label tuple list with one element.
+ """ + raise NotImplementedError diff --git a/adversarial/advbox/attacks/gradientsign.py b/adversarial/advbox/attacks/gradientsign.py new file mode 100644 index 0000000000000000000000000000000000000000..cc26ffb69020a87f559c537f03de84f7c2bea2de --- /dev/null +++ b/adversarial/advbox/attacks/gradientsign.py @@ -0,0 +1,87 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +This module provide the attack method for FGSM's implement. +""" +from __future__ import division +import numpy as np +from collections import Iterable +from .base import Attack + + +class GradientSignAttack(Attack): + """ + This attack was originally implemented by Goodfellow et al. (2015) with the + infinity norm (and is known as the "Fast Gradient Sign Method"). This is therefore called + the Fast Gradient Method. + Paper link: https://arxiv.org/abs/1412.6572 + """ + + def _apply(self, image_label, epsilons=1000): + assert len(image_label) == 1 + pre_label = np.argmax(self.model.predict(image_label)) + + min_, max_ = self.model.bounds() + gradient = self.model.gradient(image_label) + gradient_sign = np.sign(gradient) * (max_ - min_) + + if not isinstance(epsilons, Iterable): + epsilons = np.linspace(0, 1, num=epsilons + 1) + + for epsilon in epsilons: + adv_img = image_label[0][0].reshape( + gradient_sign.shape) + epsilon * gradient_sign + adv_img = np.clip(adv_img, min_, max_) + adv_label = np.argmax(self.model.predict([(adv_img, 0)])) + if pre_label != adv_label: + return adv_img + + +FGSM = GradientSignAttack + + +class IteratorGradientSignAttack(Attack): + """ + This attack was originally implemented by Alexey Kurakin(Google Brain). + Paper link: https://arxiv.org/pdf/1607.02533.pdf + """ + + def _apply(self, image_label, epsilons=100, steps=10): + """ + Apply the iterative gradient sign attack. + Args: + image_label(list): The image and label tuple list of one element. + epsilons(list|tuple|int): The epsilon (input variation parameter). + steps(int): The number of iterator steps. + Return: + numpy.ndarray: The adversarail sample generated by the algorithm. 
+ """ + assert len(image_label) == 1 + pre_label = np.argmax(self.model.predict(image_label)) + gradient = self.model.gradient(image_label) + min_, max_ = self.model.bounds() + + if not isinstance(epsilons, Iterable): + epsilons = np.linspace(0, 1, num=epsilons + 1) + + for epsilon in epsilons: + adv_img = image_label[0][0].reshape(gradient.shape) + for _ in range(steps): + gradient = self.model.gradient([(adv_img, image_label[0][1])]) + gradient_sign = np.sign(gradient) * (max_ - min_) + adv_img = adv_img + epsilon * gradient_sign + adv_img = np.clip(adv_img, min_, max_) + adv_label = np.argmax(self.model.predict([(adv_img, 0)])) + if pre_label != adv_label: + return adv_img diff --git a/adversarial/advbox/models/__init__.py b/adversarial/advbox/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eee0f6efd4774b42fcd082eb06d1398d2ee51bc4 --- /dev/null +++ b/adversarial/advbox/models/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Paddle model for target of attack +""" diff --git a/adversarial/advbox/models/base.py b/adversarial/advbox/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..084e563f7b423f54f350b90649b04acc17b2db97 --- /dev/null +++ b/adversarial/advbox/models/base.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +The base model of the model. +""" +from abc import ABCMeta +import abc + +abstractmethod = abc.abstractmethod + + +class Model(object): + """ + Base class of model to provide attack. + + + Args: + bounds(tuple): The lower and upper bound for the image pixel. + channel_axis(int): The index of the axis that represents the color channel. + preprocess(tuple): Two element tuple used to preprocess the input. First + substract the first element, then divide the second element. + """ + __metaclass__ = ABCMeta + + def __init__(self, bounds, channel_axis, preprocess=None): + assert len(bounds) == 2 + assert channel_axis in [0, 1, 2, 3] + + if preprocess is None: + preprocess = (0, 1) + self._bounds = bounds + self._channel_axis = channel_axis + self._preprocess = preprocess + + def bounds(self): + """ + Return the upper and lower bounds of the model. + """ + return self._bounds + + def channel_axis(self): + """ + Return the channel axis of the model. 
+ """ + return self._channel_axis + + def _process_input(self, input_): + res = input_ + sub, div = self._preprocess + if sub != 0: + res = input_ - sub + assert div != 0 + if div != 1: + res /= div + return res + + @abstractmethod + def predict(self, image_batch): + """ + Calculate the prediction of the image batch. + + Args: + image_batch(numpy.ndarray): image batch of shape (batch_size, height, width, channels). + + Return: + numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes). + """ + raise NotImplementedError + + @abstractmethod + def num_classes(self): + """ + Determine the number of the classes + + Return: + int: the number of the classes + """ + raise NotImplementedError + + @abstractmethod + def gradient(self, image_batch): + """ + Calculate the gradient of the cross-entropy loss w.r.t the image. + + Args: + image_batch(list): The image and label tuple list. + + Return: + numpy.ndarray: gradient of the cross-entropy loss w.r.t the image with + the shape (height, width, channel). + """ + raise NotImplementedError diff --git a/adversarial/advbox/models/paddle.py b/adversarial/advbox/models/paddle.py new file mode 100644 index 0000000000000000000000000000000000000000..4048b47f897000c1b004cb05f8bbca985d5bbbb8 --- /dev/null +++ b/adversarial/advbox/models/paddle.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import absolute_import + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +from paddle.v2.fluid.framework import program_guard + +from .base import Model + + +class PaddleModel(Model): + """ + Create a PaddleModel instance. + When you need to generate a adversarial sample, you should construct an instance of PaddleModel. + + Args: + program(paddle.v2.fluid.framework.Program): The program of the model which generate the adversarial sample. + input_name(string): The name of the input. + logits_name(string): The name of the logits. + predict_name(string): The name of the predict. + cost_name(string): The name of the loss in the program. + """ + + def __init__(self, + program, + input_name, + logits_name, + predict_name, + cost_name, + bounds, + channel_axis=3, + preprocess=None): + super(PaddleModel, self).__init__( + bounds=bounds, channel_axis=channel_axis, preprocess=preprocess) + + if preprocess is None: + preprocess = (0, 1) + + self._program = program + self._place = fluid.CPUPlace() + self._exe = fluid.Executor(self._place) + + self._input_name = input_name + self._logits_name = logits_name + self._predict_name = predict_name + self._cost_name = cost_name + + # gradient + loss = self._program.block(0).var(self._cost_name) + param_grads = fluid.backward.append_backward( + loss, parameter_list=[self._input_name]) + self._gradient = dict(param_grads)[self._input_name] + + def predict(self, image_batch): + """ + Predict the label of the image_batch. + + Args: + image_batch(list): The image and label tuple list. 
+ Return: + numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes). + """ + feeder = fluid.DataFeeder( + feed_list=[self._input_name, self._logits_name], + place=self._place, + program=self._program) + predict_var = self._program.block(0).var(self._predict_name) + predict = self._exe.run(self._program, + feed=feeder.feed(image_batch), + fetch_list=[predict_var]) + return predict + + def num_classes(self): + """ + Calculate the number of classes of the output label. + + Return: + int: the number of classes + """ + predict_var = self._program.block(0).var(self._predict_name) + assert len(predict_var.shape) == 2 + return predict_var.shape[1] + + def gradient(self, image_batch): + """ + Calculate the gradient of the loss w.r.t the input. + + Args: + image_batch(list): The image and label tuple list. + Return: + list: The list of the gradient of the image. + """ + feeder = fluid.DataFeeder( + feed_list=[self._input_name, self._logits_name], + place=self._place, + program=self._program) + + grad, = self._exe.run(self._program, + feed=feeder.feed(image_batch), + fetch_list=[self._gradient]) + return grad diff --git a/adversarial/fluid_mnist.py b/adversarial/fluid_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c7fe8d0ef6a6b0756ef73c14d8937b9cd1a738 --- /dev/null +++ b/adversarial/fluid_mnist.py @@ -0,0 +1,99 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+""" +CNN on mnist data using fluid api of paddlepaddle +""" +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + + +def mnist_cnn_model(img): + """ + Mnist cnn model + + Args: + img(Varaible): the input image to be recognized + + Returns: + Variable: the label prediction + """ + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + num_filters=20, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + num_filters=50, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return logits + + +def main(): + """ + Train the cnn model on mnist datasets + """ + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + optimizer = fluid.optimizer.Adam(learning_rate=0.01) + optimizer.minimize(avg_cost) + + accuracy = fluid.evaluator.Accuracy(input=logits, label=label) + + BATCH_SIZE = 50 + PASS_NUM = 3 + ACC_THRESHOLD = 0.98 + LOSS_THRESHOLD = 10.0 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + + str(pass_acc)) + if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD: + break + + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) + fluid.io.save_params( + exe, dirname='./mnist', main_program=fluid.default_main_program()) + print('train mnist done') + + +if __name__ == '__main__': + main() diff --git a/adversarial/mnist_tutorial_fgsm.py b/adversarial/mnist_tutorial_fgsm.py new file mode 100644 index 0000000000000000000000000000000000000000..c63e030cd826abe24eacab21394c612b7c2d9495 --- /dev/null +++ b/adversarial/mnist_tutorial_fgsm.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +FGSM demos on mnist using advbox tool. 
+""" +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import matplotlib.pyplot as plt +import numpy as np + +from advbox.models.paddle import PaddleModel +from advbox.attacks.gradientsign import GradientSignAttack + + +def cnn_model(img): + """ + Mnist cnn model + Args: + img(Varaible): the input image to be recognized + Returns: + Variable: the label prediction + """ + #conv1 = fluid.nets.conv2d() + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + num_filters=20, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + num_filters=50, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return logits + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder( + feed_list=[IMG_NAME, LABEL_NAME], + place=place, + program=fluid.default_main_program()) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME, + logits.name, avg_cost.name, (-1, 1)) + att = GradientSignAttack(m) + for data in train_reader(): + # fgsm attack + adv_img = att(data) + plt.imshow(n[0][0], cmap='Greys_r') + plt.show() + #np.save('adv_img', adv_img) + break + + +if __name__ == '__main__': + main() diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 040f5ffa41968cbf93a817faa1db86c18956341e..8b7dc5b7db800896eb4de2054ab5e584aed93999 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -2,28 +2,27 @@ Machine: -- Server - - Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket -- Laptop - - DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD - - i5 MacBook Pro (Retina, 13-inch, Early 2015) -- Desktop - - i7-6700k +- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket +- Laptop: TBD System: CentOS release 6.3 (Final), Docker 1.12.1. -PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0) - -- MKL-DNN tag v0.10 -- MKLML 2018.0.20170720 -- OpenBLAS v0.2.20 +PaddlePaddle: +- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN) + - MKL-DNN tag v0.11 + - MKLML 2018.0.1.20171007 +- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS) + - OpenBLAS v0.2.20 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. ## Benchmark Model ### Server + +#### Training Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +Pay attetion that the speed below includes forward, backward and parameter update time. 
So we cannot directly compare these numbers with the benchmark of the Caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contains the forward and backward passes. The parameter update time becomes very heavy when the weights are large, especially for AlexNet. Input image size - 3 * 224 * 224, Time: images/second @@ -31,18 +30,83 @@ Input image size - 3 * 224 * 224, Time: images/second | BatchSize | 64 | 128 | 256 | |--------------|-------| -----| --------| -| OpenBLAS | 7.82 | 8.62 | 10.34 | -| MKLML | 11.02 | 12.86 | 15.33 | -| MKL-DNN | 27.69 | 28.8 | 29.27 | +| OpenBLAS | 7.80 | 9.00 | 10.80 | +| MKLML | 12.12 | 13.70 | 16.18 | +| MKL-DNN | 28.46 | 29.83 | 30.44 | + -chart on batch size 128 -TBD + - ResNet-50 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 25.22 | 25.68 | 27.12 | +| MKLML | 32.52 | 31.89 | 33.12 | +| MKL-DNN | 81.69 | 82.35 | 84.08 | + + - - ResNet - GoogLeNet +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 89.52 | 96.97 | 108.25 | +| MKLML | 128.46| 137.89| 158.63 | +| MKL-DNN | 250.46| 264.83| 269.50 | + + +- AlexNet + +| BatchSize | 64 | 128 | 256 | +|--------------|--------| ------ | -------| +| OpenBLAS | 45.62 | 72.79 | 107.22 | +| MKLML | 66.37 | 105.60 | 144.04 | +| MKL-DNN | 399.00 | 498.94 | 626.53 | + + +#### Inference +Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +- VGG-19 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|-------|-------|-------|-------| +| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 | +| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 | +| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 | + + +- ResNet-50 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|--------|--------|--------|--------| +| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 | +| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 | +| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 | + + +- GoogLeNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 | +| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | +| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | + + +- AlexNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 | +| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 | +| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 | + + + ### Laptop TBD -### Desktop -TBD diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b619613ea7a5b6e940ec735314e8e47338b2c600 --- /dev/null +++ b/benchmark/cluster/README.md @@ -0,0 +1,78 @@ +# Cluster Training Benchmark + +## Setup + +- Platform + - Kubernetes: v1.6.2 + - Linux Kernel: v3.10.0 + +- Resource + - CPU: 10 Cores per Pod + - Memory: 5GB per Pod + +- Docker Image + + We use different base Docker Images to run the benchmark on Kubernetes: + - PaddlePaddle v2: paddlepaddle/paddle:0.11.0 + - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id] + - TensorFlow: tensorflow/tensorflow:1.5.0-rc0 + +- Model + vgg16 is used in this benchmark. + +## Cases + +- Variable + - Batch Size of training data. + - PServer count of the training job. + - The number of trainers. + +- Invariant + - The resource of trainer/pserver Pod.
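The parallel-efficiency case below defines a speedup $S$ and an efficiency $E$; since every result cell is still TBD, a small sketch with invented timings may help make the metric concrete (Python, hypothetical numbers only):

```python
# Speedup and parallel efficiency as defined in the efficiency case below:
#   S = T1 / TN   (training time with 1 trainer over time with N trainers)
#   E = S / N
def parallel_efficiency(t1, tn, n):
    speedup = t1 / tn
    return speedup, speedup / n

# Hypothetical example: 1 trainer takes 100 minutes, 20 trainers take 6.
s, e = parallel_efficiency(100.0, 6.0, 20)
print("S=%.1f E=%.2f" % (s, e))  # S=16.7 E=0.83
```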
+ +### Measure the Performance for Different Batch Size + +- PServer Count: 40 +- Trainer Count: 100 +- Metrics: mini-batch / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | +| TensorFlow | - | - | - | - | + +### Measure the Performance for Different PServer Count + +- Trainer Count: 100 +- Batch Size: 64 +- Metrics: mini-batch / sec + +| PServer Count | 10 | 20 | 40 | 60 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | +| TensorFlow | - | - | - | - | + +### Measure Parallel Efficiency By Increasing Trainer Count + +- PServer Count: 20 +- Batch Size: 64 +- Metrics: + +$S = T_1 / T_N$ + +where $S$ is the ratio of $T_1$ to $T_N$, the training times with 1 and $N$ trainers. +The parallel efficiency is: + +$E = S / N$ + +| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 | +| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | +| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | + +## Reproduce the benchmark + +TODO diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..6215ae4e4288f969a909c258ddd5b5f51e6abb3f Binary files /dev/null and b/benchmark/figs/alexnet-cpu-infer.png differ diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..b3200bbc049a9d75857fb5692902d7b475aa8f68 Binary files /dev/null and b/benchmark/figs/alexnet-cpu-train.png differ diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..19478d433bae651f4506153ded11a96d5137b409 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-infer.png differ diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..4e86e058d0654d02c898bf7f5fe73aa1c7614e20 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..bc43d4b8d20c600d6f1046a5986a6c62adfa6b44 Binary files /dev/null and b/benchmark/figs/resnet-cpu-infer.png differ diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..96746b1759fa17d25ac5f40ed3678e16086364ba Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..3a51ec6c474f0e0f0c4384c8ccd1e08c4382230b Binary files /dev/null and b/benchmark/figs/vgg-cpu-infer.png differ diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..6d548cfd59f86f8166c011d71ebde4e4b33ef644 Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py index
3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..07f478d8fa4e1ac4c584d4c410f75555c7926d4b 100644 --- a/benchmark/paddle/image/alexnet.py +++ b/benchmark/paddle/image/alexnet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python from paddle.trainer_config_helpers import * @@ -6,10 +19,24 @@ height = 227 width = 227 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) +gp = get_config_arg('layer_num', int, 1) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -31,7 +58,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2) # conv2 net = img_conv_layer( - input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1) + input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) net = img_pool_layer(input=net, pool_size=3, stride=2) @@ -40,11 +67,11 @@ net = img_conv_layer( input=net, filter_size=3, num_filters=384, stride=1, padding=1) # conv4 net = img_conv_layer( - input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp) # conv5 net = img_conv_layer( - input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp) net = img_pool_layer(input=net, pool_size=3, stride=2) net = fc_layer( @@ -59,6 +86,9 @@ net = fc_layer( layer_attr=ExtraAttr(drop_rate=0.5)) net = fc_layer(input=net, size=1000, act=SoftmaxActivation()) -lab = data_layer('label', num_class) -loss = cross_entropy(input=net, label=lab) -outputs(loss) +if is_infer: + outputs(net) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=net, label=lab) + outputs(loss) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index a88ecac67d9e677f14f6dc24ba9a337b1245243f..3241be9c5f56d3e3b422081e26878ae690a33268 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python from paddle.trainer_config_helpers import * @@ -6,10 +19,23 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) use_gpu = get_config_arg('use_gpu', bool, True) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -146,7 +172,6 @@ def inception(name, input, channels, \ return cat -lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 @@ -224,6 +249,10 @@ pool5 = img_pool_layer( dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) out3 = fc_layer( name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) -outputs(loss3) +if is_infer: + outputs(out3) +else: + lab = data_layer(name="label", size=num_class) + loss3 = cross_entropy(name='loss3', input=out3, label=lab) + outputs(loss3) diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py new file mode 100644 index 0000000000000000000000000000000000000000..8679d4f272d1b7aaf8d5a397f07698a6b70e4fcd --- /dev/null +++ b/benchmark/paddle/image/plotlog.py @@ -0,0 +1,114 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
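+# Example usage (hypothetical log file name; the flags are defined in parse_args below): +#   python plotlog.py --file_path train.log --sample_rate 0.1 --log_period 10 +# --sample_rate 0.1 keeps every 10th point, and --log_period 10 assumes one log line per 10 batches; the plots are saved as loss.jpg and accuracy.jpg.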
+ +import sys +import argparse +import matplotlib.pyplot as plt + + +def parse_args(): + parser = argparse.ArgumentParser('Parse Log') + parser.add_argument( + '--file_path', '-f', type=str, help='the path of the log file') + parser.add_argument( + '--sample_rate', + '-s', + type=float, + default=1.0, + help='the rate to take samples from log') + parser.add_argument( + '--log_period', '-p', type=int, default=1, help='the period of log') + + args = parser.parse_args() + return args + + +def parse_file(file_name): + loss = [] + error = [] + with open(file_name) as f: + for i, line in enumerate(f): + line = line.strip() + if not line.startswith('pass'): + continue + line_split = line.split(' ') + if len(line_split) != 5: + continue + + loss_str = line_split[2][:-1] + cur_loss = float(loss_str.split('=')[-1]) + loss.append(cur_loss) + + err_str = line_split[3][:-1] + cur_err = float(err_str.split('=')[-1]) + error.append(cur_err) + + accuracy = [1.0 - err for err in error] + + return loss, accuracy + + +def sample(metric, sample_rate): + interval = int(1.0 / sample_rate) + if interval > len(metric): + return metric[:1] + + num = len(metric) / interval + idx = [interval * i for i in range(num)] + metric_sample = [metric[id] for id in idx] + return metric_sample + + +def plot_metric(metric, + batch_id, + graph_title, + line_style='b-', + line_label='y', + line_num=1): + plt.figure() + plt.title(graph_title) + if line_num == 1: + plt.plot(batch_id, metric, line_style, label=line_label) + else: + for i in range(line_num): + plt.plot(batch_id, metric[i], line_style[i], label=line_label[i]) + plt.xlabel('batch') + plt.ylabel(graph_title) + plt.legend() + plt.savefig(graph_title + '.jpg') + plt.close() + + +def main(): + args = parse_args() + assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should be in the range (0, 1]." + + loss, accuracy = parse_file(args.file_path) + batch = [args.log_period * i for i in range(len(loss))] + + batch_sample = sample(batch, args.sample_rate) + loss_sample = sample(loss, args.sample_rate) + accuracy_sample = sample(accuracy, args.sample_rate) + + plot_metric(loss_sample, batch_sample, 'loss', line_label='loss') + plot_metric( + accuracy_sample, + batch_sample, + 'accuracy', + line_style='g-', + line_label='accuracy') + + +if __name__ == '__main__': + main() diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 4703944c8722552d56ba80a8e0663de5fb4df53d..220c4bee35c89e90d7ba5edbb9b92632a0215811 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License.
import io, os import random import numpy as np @@ -13,14 +26,21 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.data_size = settings.height * settings.width * 3 else: settings.data_size = settings.height * settings.width - - settings.slots = [dense_vector(settings.data_size), integer_value(1)] + settings.is_infer = kwargs.get('is_infer', False) + settings.num_samples = kwargs.get('num_samples', 2560) + if settings.is_infer: + settings.slots = [dense_vector(settings.data_size)] + else: + settings.slots = [dense_vector(settings.data_size), integer_value(1)] @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(1024): + for i in xrange(settings.num_samples): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class - 1) - yield img.astype('float32'), int(lab) + if settings.is_infer: + yield img.astype('float32') + else: + lab = random.randint(0, settings.num_class - 1) + yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index 6ae1857642e8df4b3859eec68a3a5227d1c4fcb3..acc6d31d4bb4eeca3e1d98e7b42ea8a2bd8bc90b 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
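The provider change above yields one slot per sample in inference mode (just the dense image vector) and two in training mode (vector plus integer label), matching the slots declared in initHook. A self-contained sketch of that branch, using numpy and random just as provider.py does (make_sample is an illustrative helper, not part of the provider API):

import random
import numpy as np

def make_sample(data_size, num_class, is_infer):
    # Random values stand in for image pixels, as in provider.py.
    img = np.random.rand(1, data_size).reshape(-1, 1).flatten()
    if is_infer:
        return (img.astype('float32'),)  # one slot: the dense_vector
    return (img.astype('float32'), random.randint(0, num_class - 1))  # vector + label

assert len(make_sample(12, 10, True)) == 1
assert len(make_sample(12, 10, False)) == 2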
#!/usr/bin/env python from paddle.trainer_config_helpers import * @@ -6,11 +19,23 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg("layer_num", int, 50) -is_test = get_config_arg("is_test", bool, False) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -45,7 +70,10 @@ def conv_bn_layer(name, act=LinearActivation(), bias_attr=False) return batch_norm_layer( - name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test) + name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_infer) def bottleneck_block(name, input, num_filters1, num_filters2): @@ -207,7 +235,9 @@ elif layer_num == 152: else: print("Wrong layer number.") -lbl = data_layer(name="label", size=num_class) -loss = cross_entropy(name='loss', input=resnet, label=lbl) -inputs(img, lbl) -outputs(loss) +if is_infer: + outputs(resnet) +else: + lbl = data_layer(name="label", size=num_class) + loss = cross_entropy(name='loss', input=resnet, label=lbl) + outputs(loss) diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 --- /dev/null +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -0,0 +1,87 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` +} + +function infer() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + if [ $thread -gt $bs ]; then + thread=$bs + fi + log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! -d $models_in ]; then + echo "Training model ${topology}_${layer_num}" + paddle train --job=train \ + --config="${topology}.py" \ + --use_mkldnn=True \ + --use_gpu=False \ + --trainer_count=1 \ + --num_passes=1 \ + --save_dir="models/${topology}-${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \ + > /dev/null 2>&1 + echo "Done" + fi + log_period=$((256 / bs)) + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=$log_period \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} + + # calculate the time of the last 5 log periods, i.e. 1280 samples; + # the time before that is warm-up time.
+ start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` + echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi +if [ ! -d "models" ]; then + mkdir -p models +fi + +# inference benchmark +for use_mkldnn in True False; do + for batchsize in 1 2 4 8 16; do + infer vgg 19 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer googlenet v1 $batchsize $use_mkldnn + infer alexnet 2 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkl_train.sh similarity index 66% rename from benchmark/paddle/image/run_mkldnn.sh rename to benchmark/paddle/image/run_mkl_train.sh index f768f6c29a84b40f917e0ccfde4d8c15f65c818b..03d2d378fb72e36f765d89af788f6ee96fe21d4e 100755 --- a/benchmark/paddle/image/run_mkldnn.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -8,13 +8,13 @@ function train() { use_mkldnn=$4 if [ $4 == "True" ]; then thread=1 - log="logs/${topology}-${layer_num}-mkldnn-${bs}.log" + log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" elif [ $4 == "False" ]; then thread=`nproc` # each trainer_count use only 1 core to avoid conflict - log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" + log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" else - echo "Wrong input $3, use True or False." + echo "Wrong input $4, use True or False." exit 0 fi args="batch_size=${bs},layer_num=${layer_num}" @@ -28,19 +28,25 @@ function train() { --test_period=100 \ --config_args=$args \ 2>&1 | tee ${log} + + avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'` + fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'` + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} } -if [ ! -d "train.list" ]; then +if [ ! -f "train.list" ]; then echo " " > train.list fi if [ ! -d "logs" ]; then mkdir logs fi +# training benchmark for use_mkldnn in True False; do for batchsize in 64 128 256; do train vgg 19 $batchsize $use_mkldnn train resnet 50 $batchsize $use_mkldnn train googlenet v1 $batchsize $use_mkldnn + train alexnet 2 $batchsize $use_mkldnn done done diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..a9a7b8a66717c4be0543c3fe2db293fe199e3dc4 --- /dev/null +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -0,0 +1,69 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` +} + +function infer() { + export OPENBLAS_MAIN_FREE=1 + topology=$1 + layer_num=$2 + bs=$3 + trainers=`nproc` + if [ $trainers -gt $bs ]; then + trainers=$bs + fi + log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log" + threads=$((`nproc` / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + export OPENBLAS_NUM_THREADS=$threads + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ !
-d $models_in ]; then + echo "Run ./run_mkl_infer.sh first to save the model." + exit 0 + fi + log_period=$((32 / bs)) + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=False \ + --use_gpu=False \ + --trainer_count=$trainers \ + --log_period=$log_period \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} + + # calculate the time of the last 5 log periods, i.e. 160(=32*5) samples; + # the time before that is warm-up time. + start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'` + echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +# inference benchmark +for batchsize in 1 2 4 8 16; do + infer vgg 19 $batchsize + infer resnet 50 $batchsize + infer googlenet v1 $batchsize + infer alexnet 2 $batchsize +done diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..935cff6f2c97d25d6de556cfee25e27dbe49b5b6 --- /dev/null +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -0,0 +1,41 @@ +set -e + +function train() { + export OPENBLAS_NUM_THREADS=1 + topology=$1 + layer_num=$2 + bs=$3 + thread=`nproc` + # each trainer_count uses only 1 core to avoid conflicts + log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log" + args="batch_size=${bs},layer_num=${layer_num}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=False \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=3 \ + --test_period=30 \ + --config_args=$args \ + 2>&1 | tee ${log} + + avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'` + fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'` + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +# training benchmark +for batchsize in 64 128 256; do + train vgg 19 $batchsize + train resnet 50 $batchsize + train googlenet v1 $batchsize + train alexnet 2 $batchsize +done diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py index 58879c454f37991405d83bbb593bb5d1e977ff53..64a5da3220bf3294b88c015fe133e7b04573d954 100644 --- a/benchmark/paddle/image/smallnet_mnist_cifar.py +++ b/benchmark/paddle/image/smallnet_mnist_cifar.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License.
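Both infer scripts time the last few log periods by diffing two wall-clock stamps pulled from the log tail. The clock_to_seconds arithmetic and the FPS division translate directly to Python; the timestamps below are made up, and the HH:MM:SS.ss stamp format is an assumption carried over from the shell version:

def clock_to_seconds(stamp):
    # 'HH:MM:SS.ss' -> seconds, mirroring the awk pipeline above.
    hours, mins, secs = stamp.split(':')
    return float(secs) + int(mins) * 60 + int(hours) * 3600

start, end = '12:01:05.50', '12:01:21.50'  # hypothetical log timestamps
elapsed = clock_to_seconds(end) - clock_to_seconds(start)
print('FPS: %.2f' % (160 / elapsed))  # 160 samples / 16 s -> 10.00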
#!/usr/bin/env python from paddle.trainer_config_helpers import * diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 420884ed8e1ae36a3f1772bfbe8323f3d0ea71e6..a357207a6282aa7864a7ecffa8c54e7b2360b45e 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python from paddle.trainer_config_helpers import * @@ -6,10 +19,23 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg('layer_num', int, 19) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -98,6 +124,9 @@ elif layer_num == 19: else: print("Wrong layer number.") -lab = data_layer('label', num_class) -loss = cross_entropy(input=vgg, label=lab) -outputs(loss) +if is_infer: + outputs(vgg) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=vgg, label=lab) + outputs(loss) diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py index 928ca75daf84ccebb775364b0be0d8b3d5eebff9..c03df3a0026447d5ab239f90600975751d287799 100644 --- a/benchmark/paddle/rnn/provider.py +++ b/benchmark/paddle/rnn/provider.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import io, os import random import numpy as np diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py index 83eb3e565473f7e7e91cddeaa3cd2aafb7e3df2c..97005f2c351ff2301a1b834e79931bb6ccc7abc8 100755 --- a/benchmark/paddle/rnn/rnn.py +++ b/benchmark/paddle/rnn/rnn.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python from paddle.trainer_config_helpers import * diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py index f6a39ef778e21bee7374718a1b1ddf43392825a8..edf462e6a188f4a83d7276086fcacfb644b562a1 100644 --- a/benchmark/tensorflow/image/alexnet.py +++ b/benchmark/tensorflow/image/alexnet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py index 7b5ee78f4dd5429abd85d75c092a6e3a2a39f922..90b8f16bca027fd14a62c2e65bdb606c6b7d79d9 100644 --- a/benchmark/tensorflow/image/alexnet_multi_gpu.py +++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py index decf855b54451efba5f6a7868fbcf631789f3572..55431eceb3ce41c05fc5fb2d417135494adf3b2c 100644 --- a/benchmark/tensorflow/image/googlenet.py +++ b/benchmark/tensorflow/image/googlenet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from six.moves import xrange from datetime import datetime import math diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py index 31466faa37c47c66e4fe4628e28c867875e89f2e..44de3800a8a42d90debae2c567795789f3eb0a7d 100644 --- a/benchmark/tensorflow/image/googlenet_multi_gpu.py +++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py index 1a625134a6c58586b29190ede9c66253f484d2cf..0858b5f9c9c60264c0427c9e1fbdfcd167cf418e 100644 --- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py +++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py index f538329a15ea9ad9293c97c94340989e2c421eb2..710940c9ae24eb307ed864aa2b17d66c1d6ee29b 100755 --- a/benchmark/tensorflow/rnn/reader.py +++ b/benchmark/tensorflow/rnn/reader.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import os.path import io import numpy as np diff --git a/benchmark/tensorflow/rnn/rnn.py b/benchmark/tensorflow/rnn/rnn.py index f288083e13656563b511980553245142efec4e65..507481b9ccd3a037d396f91bb860255a343905c5 100755 --- a/benchmark/tensorflow/rnn/rnn.py +++ b/benchmark/tensorflow/rnn/rnn.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python from six.moves import xrange # pylint: disable=redefined-builtin import math diff --git a/benchmark/tensorflow/rnn/rnn_multi_gpu.py b/benchmark/tensorflow/rnn/rnn_multi_gpu.py index eabee4fa8fe6325212ace1c11be4862cd2720b08..f24cbaef62d94f2d94c243bd3c8ffcf1dcec7614 100755 --- a/benchmark/tensorflow/rnn/rnn_multi_gpu.py +++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python from six.moves import xrange # pylint: disable=redefined-builtin import re diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6..6320b17520a687f88993b6f464d9115838b0f96b 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -3,7 +3,7 @@ # It will search MKLML, atlas, OpenBlas, reference-cblas in order. # # If any cblas implementation found, the following variable will be set. -# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE +# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE # CBLAS_INC_DIR # the include directory for cblas. # CBLAS_LIBS # a list of libraries should be linked by paddle. # # Each library should be full path to object file. @@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_INC_DIR ${MKLML_INC_DIR}) set(CBLAS_LIBRARIES ${MKLML_LIB}) - add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DPADDLE_WITH_MKLML) add_definitions(-DLAPACK_FOUND) message(STATUS "Found cblas and lapack in MKLML " @@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) return() endif() -## Then find atlas. -set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas") -set(ATLAS_INCLUDE_SEARCH_PATHS - ${ATLAS_ROOT}/include - /usr/include - /usr/include/atlas) -set(ATLAS_LIB_SEARCH_PATHS - ${ATLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/atlas - /usr/lib/atlas - /usr/lib/atlas-base # special for ubuntu 14.04. 
- ) -find_path(ATLAS_INC_DIR NAMES cblas.h - PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) -find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h - PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) -find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 - PATHS ${ATLAS_LIB_SEARCH_PATHS}) -find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3 - PATHS ${ATLAS_LIB_SEARCH_PATHS}) - -if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) - set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB}) - - add_definitions(-DPADDLE_USE_ATLAS) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") - return() -endif() - ## Then find openblas. set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") set(OPENBLAS_INCLUDE_SEARCH_PATHS diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e550ec285668ea25757eeee9e7c5dc48fc9d339d..5c6bcfde76a1201f792d04766d698db8cd395a49 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,11 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 0000000000000000000000000000000000000000..aec51410b33669f8a549f2eca193cc6aa2d07a13 --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." 
FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 96fc886a342cae38d5b804266d3af7bc909a4da2..c4712f19eb80b34ffbf713d2b13fc0c775312af1 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -19,7 +19,7 @@ ExternalProject_Add( if (${CMAKE_VERSION} VERSION_LESS "3.3.0") set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) - file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";") + file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";") add_library(eigen3 STATIC ${dummyfile}) else() add_library(eigen3 INTERFACE) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index c819eb4d70898e48eab499c666168d78262d4240..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(yiwang): The annoying warnings mentioned in - # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by - # gflags. I fired a PR https://github.com/gflags/gflags/pull/230 - # to fix it. Before it gets accepted by the gflags team, we use - # my personal fork, which contains above fix, temporarily. Let's - # change this back to the official Github repo once my PR is - # merged. - GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" - GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 08bdc1e1623b0d917061c7368e9b2a8f7e9517fd..0c6b3aafcb4e990b9d4549820137474e5968a7aa 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -26,12 +26,21 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) +IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # Using the unofficial glog for Android API < 21 + SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") + SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") +ELSE() + SET(GLOG_REPOSITORY "https://github.com/google/glog.git") + SET(GLOG_TAG "v0.3.5") +ENDIF() + ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags - GIT_REPOSITORY "https://github.com/google/glog.git" - GIT_TAG v0.3.5 + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..79b2449fe6689993bbee8a24ae7c46b452afe0a0 --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,66 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) +IF(APPLE) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) +ENDIF() + +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + GIT_REPOSITORY "https://github.com/grpc/grpc.git" + GIT_TAG "v1.8.x" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + # NOTE(yuyang18): + # Disable -Werror; otherwise the build fails on macOS. + # This cannot be configured through the make command line, so we + # dry-run make, strip `-Werror` from its output, and pipe the result to a shell. + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +# FIXME(typhoonzero): this is a hack to pick up the static library paths; a better way would be to merge them.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fc52d339d7a336b44c97f2e0a9fc8d6604854365..89fc34796a03ff3f3e5b022ae10b2646832b1ac7 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -63,9 +63,30 @@ ExternalProject_Add( -DMKLROOT:PATH=${MKLML_ROOT} ) -ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) -ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) +ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") -add_definitions(-DPADDLE_USE_MKLDNN) -LIST(APPEND external_project_dependencies mkldnn) +add_definitions(-DPADDLE_WITH_MKLDNN) +LIST(APPEND external_project_dependencies shared_mkldnn) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(mkldnn STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) + +# copy the real so.0 lib to install dir +# it can be directly contained in wheel or capi +SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) +ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) + +IF(WITH_C_API) + INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) +ENDIF() + diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 20dbc32a738d982df2d3f035206279c82c8de264..15a07ea3daf5aa606235f20288a8306966334a1a 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) + +IF(WITH_C_API) + INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4c4f59656dae68739f2f07f3febd510e727fe2dd..0e79c0cc7992060cbe3b668ec927936183389eb6 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND}) CACHE FILEPATH "openblas library." 
FORCE) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") + SET(OPENBLAS_COMMIT "v0.2.20") IF(CMAKE_CROSSCOMPILING) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) IF(ANDROID) - # arm_soft_fp_abi branch of OpenBLAS to support softfp - # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + # use softfp SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) ENDIF() ELSEIF(IOS) IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) @@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() ELSEIF(RPI) # use hardfp - SET(OPENBLAS_COMMIT "v0.2.20") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) ENDIF() ELSE() IF(APPLE) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") ENDIF() - SET(OPENBLAS_COMMIT "v0.2.20") SET(OPTIONAL_ARGS "") IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) @@ -113,12 +109,8 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # FIXME(gangliao): generate cblas target to track all high performance # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) -FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") -IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") - ADD_LIBRARY(cblas SHARED ${dummyfile}) -ELSE() - ADD_LIBRARY(cblas STATIC ${dummyfile}) -ENDIF() +FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") +ADD_LIBRARY(cblas STATIC ${dummyfile}) TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) IF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index be7f6a9465970711170bd15dcecaadeaa8a55f86..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -15,7 +15,18 @@ INCLUDE(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp FIND_PACKAGE(Protobuf QUIET) -SET(PROTOBUF_FOUND "OFF") +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. function(protobuf_generate_python SRCS) @@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. 
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) - FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) @@ -128,11 +138,11 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) - find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") SET_PROTOBUF_VERSION() @@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + IF(MOBILE_INFERENCE) + # The reason why the official version is not used is described in + # https://github.com/PaddlePaddle/Paddle/issues/6114 + SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") + SET(PROTOBUF_TAG "v3.2.0") + IF(NOT BUILD_FOR_HOST) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") + ENDIF() + ENDIF() + ExternalProject_Add( ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${OPTIONAL_ARGS} @@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1) +IF(NOT MOBILE_INFERENCE) + SET(PROTOBUF_VERSION 3.1) +ELSE() + SET(PROTOBUF_VERSION 3.2) +ENDIF() IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) LIST(APPEND external_project_dependencies protobuf_host) @@ -227,9 +253,9 @@ IF(NOT PROTOBUF_FOUND) IF(WITH_C_API) INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) IF(ANDROID) - INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) ELSE() - INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) ENDIF() ENDIF() diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a8e1aca49c97df256b1269c286b0bce7732fa932..7cb4efa7bff7164464f1210a2b2188226c219ef6 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -63,7 +63,7 @@ ExternalProject_Add( MESSAGE(STATUS "warp-ctc 
library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) -ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) +ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) ADD_DEPENDENCIES(warpctc extern_warpctc) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index a98e069b7cd1654ddd5868560d0905eab6d9c692..1638cd8fdfc34575132462859e056a1907f0b2f1 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -50,6 +50,8 @@ ExternalProject_Add( ) LIST(APPEND external_project_dependencies zlib) +ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) IF(WITH_C_API) INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 2b125cef6aa8d1021afe8a7a0d232d84d36be4bc..1120677a37e0d44163816b66600121c8f0d545af 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -111,6 +111,8 @@ set(COMMON_FLAGS -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=parentheses-equality # Warnings in pybind11 + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index b9c1dde97bc444d793d67ff622fd6b13c6435a9a..585db019d521b1699baadfae31ef95b5059c71b4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -120,7 +120,7 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${libs}) # Generate dummy staic lib - file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") add_library(${TARGET_NAME} STATIC ${target_SRCS}) target_link_libraries(${TARGET_NAME} ${libs_deps}) @@ -160,7 +160,7 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${libs} ${target_OBJS}) # Generate dummy staic lib - file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") add_library(${TARGET_NAME} STATIC ${target_SRCS}) target_link_libraries(${TARGET_NAME} ${libs_deps}) @@ -227,8 +227,8 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) @@ -288,8 +288,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) @@ -324,7 
+324,7 @@ function(go_library TARGET_NAME) ) # Add dummy code to support `make target_name` under Terminal Command - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";") if (go_library_SHARED OR go_library_shared) add_library(${TARGET_NAME} SHARED ${dummyfile}) else() @@ -459,11 +459,58 @@ function(py_test TARGET_NAME) if(WITH_TESTING) set(options STATIC static SHARED shared) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python - python2 ${py_test_SRCS} + ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction() + +# grpc_library generates gRPC code with grpc_cpp_plugin and protoc, +# then builds the generated protobuf and gRPC code together with your +# implementation sources. Use the SRCS argument for your implementation +# source files and the PROTO argument for your .proto file. +# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): gRPC-generated code lacks virtual destructors; demote + # those diagnostics from errors to warnings. The warnings themselves should + # eventually be removed as well. + set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() diff --git a/cmake/make_resource.py b/cmake/make_resource.py index a9241b0e3e36c2e79c79e46b4f9114b7f6947341..d71e82eca2cd10179d5ec498f7fb2aa5da679c9f 100644 --- a/cmake/make_resource.py +++ b/cmake/make_resource.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import os import re import sys diff --git a/cmake/util.cmake b/cmake/util.cmake index ad905ab55ba3537054fa5b30b5fca4d83c406702..0dc33ce385175d1e2dc454d41db467d4b9d9cf9a 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -168,17 +168,3 @@ function(create_resources res_file output_file) COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) endfunction() - - -# Create a python unittest using run_python_tests.sh, -# which takes care of making correct running environment -function(add_python_test TEST_NAME) - foreach(arg ${ARGN}) - get_filename_component(py_fn ${arg} NAME_WE) - set(TRG_NAME ${TEST_NAME}_${py_fn}) - add_test(NAME ${TRG_NAME} - COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} - python2 ${arg} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endforeach() -endfunction() diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst index 9be0b370ee5e301aee4a6e31b1cfa905754968e8..84f9097a6cdc2da269bd6a0685796e14e26da37e 100644 --- a/doc/api/index_cn.rst +++ b/doc/api/index_cn.rst @@ -7,3 +7,4 @@ API 模型配置 数据访问 训练与应用 + v2/fluid.rst diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332..e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -7,3 +7,4 @@ API v2/model_configs.rst v2/data.rst v2/run_logic.rst + v2/fluid.rst diff --git a/doc/api/v1/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py index 429338c57f8f865f0c5835d933445b65ee2ea7aa..427e0465a68b630ff8a14337e326777f41b6481a 100644 --- a/doc/api/v1/data_provider/src/mnist_config.py +++ b/doc/api/v1/data_provider/src/mnist_config.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * define_py_data_sources2( diff --git a/doc/api/v1/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py index 2ba0b126a0d6239f84950e130410aaaa6e1f24cd..3fbb783e2f66273ce79c6736a00d01ec58514bc9 100644 --- a/doc/api/v1/data_provider/src/mnist_provider.dict.py +++ b/doc/api/v1/data_provider/src/mnist_provider.dict.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer.PyDataProvider2 import * diff --git a/doc/api/v1/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py index 7ce71608a2372b2484ae40ccf01f0621728ddef2..edbf3cf1400f5fde5dfa225e5bdbb60400a0691c 100644 --- a/doc/api/v1/data_provider/src/sentimental_config.py +++ b/doc/api/v1/data_provider/src/sentimental_config.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * dictionary = dict() diff --git a/doc/api/v1/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py index 14bd0e05a921dbfd5212d8483524d3af3e4ae98f..03ad1fe7d8c66233b078f94aa303e27cffb8e83c 100644 --- a/doc/api/v1/data_provider/src/sentimental_provider.py +++ b/doc/api/v1/data_provider/src/sentimental_provider.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer.PyDataProvider2 import * diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst index eca3ce03bcdc599edca802d8dfca48d4f28275a2..5317e66b64bbd85c61f19700a9d2c1d239dee573 100644 --- a/doc/api/v2/config/activation.rst +++ b/doc/api/v2/config/activation.rst @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d4d182f6692e09b3e40f3620b77d9a0f20ec5af3..ddf0b055a92d80295b24255a5462d477e0d9c796 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,7 +54,7 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. autoclass:: paddle.v2.layer.context_projection :noindex: @@ -70,7 +70,7 @@ Image Pooling Layer img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -104,7 +104,7 @@ sum_to_one_norm --------------- .. 
autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm @@ -114,7 +114,7 @@ row_l2_norm ----------- .. autoclass:: paddle.v2.layer.row_l2_norm :noindex: - + Recurrent Layers ================ @@ -252,6 +252,11 @@ first_seq .. autoclass:: paddle.v2.layer.first_seq :noindex: +sub_seq +--------- +.. autoclass:: paddle.v2.layer.sub_seq + :noindex: + concat ------ .. autoclass:: paddle.v2.layer.concat @@ -415,6 +420,13 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: Slicing and Joining Layers ========================== @@ -460,7 +472,7 @@ lambda_cost :noindex: square_error_cost --------- +----------------- .. autoclass:: paddle.v2.layer.square_error_cost :noindex: @@ -526,7 +538,7 @@ Miscs ===== dropout --------------- +-------- .. autoclass:: paddle.v2.layer.dropout :noindex: diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..5f15cad2b530dfb3702357b3c26885ac2a7b7beb --- /dev/null +++ b/doc/api/v2/fluid.rst @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. toctree:: + :maxdepth: 1 + + fluid/layers.rst + fluid/data_feeder.rst + fluid/executor.rst + fluid/initializer.rst + fluid/evaluator.rst + fluid/nets.rst + fluid/optimizer.rst + fluid/param_attr.rst + fluid/profiler.rst + fluid/regularizer.rst + fluid/io.rst diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst new file mode 100644 index 0000000000000000000000000000000000000000..0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa --- /dev/null +++ b/doc/api/v2/fluid/data_feeder.rst @@ -0,0 +1,9 @@ +=========== +DataFeeder +=========== + +DataFeeder +----------- +.. automodule:: paddle.v2.fluid.data_feeder + :members: DataFeeder + :noindex: diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst new file mode 100644 index 0000000000000000000000000000000000000000..a23f3301d0331e0ea3733f06444515eb4680cd31 --- /dev/null +++ b/doc/api/v2/fluid/evaluator.rst @@ -0,0 +1,9 @@ +=========== +Evaluator +=========== + +Evaluator +----------- +.. automodule:: paddle.v2.fluid.evaluator + :members: Evaluator + :noindex: diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst new file mode 100644 index 0000000000000000000000000000000000000000..3a283538c120cfa1ef646c390bb71c6251c23675 --- /dev/null +++ b/doc/api/v2/fluid/executor.rst @@ -0,0 +1,9 @@ +=========== +Executor +=========== + +Executor +----------- +.. automodule:: paddle.v2.fluid.executor + :members: Executor + :noindex: diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst new file mode 100644 index 0000000000000000000000000000000000000000..8f587837e9873370722062404f511654a9460587 --- /dev/null +++ b/doc/api/v2/fluid/initializer.rst @@ -0,0 +1,50 @@ +=========== +Initializer +=========== + + + +Initializer +----------- +.. automodule:: paddle.v2.fluid.initializer + :members: Initializer + :noindex: + + + +ConstantInitializer +------------------- +.. automodule:: paddle.v2.fluid.initializer + :members: ConstantInitializer + :noindex: + + + +UniformInitializer +------------------ +.. automodule:: paddle.v2.fluid.initializer + :members: UniformInitializer + :noindex: + + + +NormalInitializer +----------------- +.. 
automodule:: paddle.v2.fluid.initializer + :members: NormalInitializer + :noindex: + + +XavierInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: XavierInitializer + :noindex: + + +MSRAInitializer +--------------- +.. automodule:: paddle.v2.fluid.initializer + :members: MSRAInitializer + :noindex: + diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst new file mode 100644 index 0000000000000000000000000000000000000000..67f68c4e9e16b379207b8de114cdf769e056f78e --- /dev/null +++ b/doc/api/v2/fluid/io.rst @@ -0,0 +1,10 @@ +=========== +IO +=========== + + + +is_parameter +----------- +.. autofunction:: paddle.v2.fluid.io.is_parameter + :noindex: diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst new file mode 100644 index 0000000000000000000000000000000000000000..62c154e65dcff1bdfb00109bf1b724c34731652e --- /dev/null +++ b/doc/api/v2/fluid/layers.rst @@ -0,0 +1,495 @@ +========== +Layers +========== + + +fc +--- +.. autofunction:: paddle.v2.fluid.layers.fc + :noindex: + +embedding +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm + :noindex: + +data +---- +.. autofunction:: paddle.v2.fluid.layers.data + :noindex: + +mean +---- +.. autofunction:: paddle.v2.fluid.layers.mean + :noindex: + +mul +--- +.. autofunction:: paddle.v2.fluid.layers.mul + :noindex: + +elementwise_add +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_add + :noindex: + +elementwise_sub +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_sub + :noindex: + +elementwise_mul +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_mul + :noindex: + +elementwise_div +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_div + :noindex: + + +dropout +------- +.. autofunction:: paddle.v2.fluid.layers.dropout + :noindex: + + +reshape +-------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +sigmoid +--------- +.. autofunction:: paddle.v2.fluid.layers.sigmoid + :noindex: + + +scale +--------- +.. autofunction:: paddle.v2.fluid.layers.scale + :noindex: + + +transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.transpose + :noindex: + + +sigmoid_cross_entropy_with_logits +--------------------------------- +.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits + :noindex: + + +cast +---- +.. autofunction:: paddle.v2.fluid.layers.cast + :noindex: + + +concat +------- +.. autofunction:: paddle.v2.fluid.layers.concat + :noindex: + + +sums +---- +.. autofunction:: paddle.v2.fluid.layers.sums + :noindex: + + +linear_chain_crf +---------------- +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + + +assign +------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + + +split_lod_tensor +---------------- +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor + :noindex: + + +merge_lod_tensor +---------------- +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + :noindex: + +cos_sim +-------- +.. autofunction:: paddle.v2.fluid.layers.cos_sim + :noindex: + + +cross_entropy +------------- +.. autofunction:: paddle.v2.fluid.layers.cross_entropy + :noindex: + + + +square_error_cost +----------------- +.. autofunction:: paddle.v2.fluid.layers.square_error_cost + :noindex: + + +accuracy +--------- +.. autofunction:: paddle.v2.fluid.layers.accuracy + :noindex: + + +sequence_conv +------------- +.. 
autofunction:: paddle.v2.fluid.layers.sequence_conv + :noindex: + + +conv2d +------ +.. autofunction:: paddle.v2.fluid.layers.conv2d + :noindex: + + +sequence_pool +------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_pool + :noindex: + + +sequence_first_step +------------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_first_step + :noindex: + + +sequence_last_step +------------------ +.. autofunction:: paddle.v2.fluid.layers.sequence_last_step + :noindex: + + +pool2d +------ +.. autofunction:: paddle.v2.fluid.layers.pool2d + :noindex: + + +batch_norm +---------- +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: + + +beam_search_decode +------------------ +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + :noindex: + + +lod_rank_table +-------------- +.. autofunction:: paddle.v2.fluid.layers.lod_rank_table + :noindex: + + +max_sequence_len +---------------- +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len + :noindex: + + +topk +----- +.. autofunction:: paddle.v2.fluid.layers.topk + :noindex: + + +lod_tensor_to_array +------------------- +.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array + :noindex: + + + +array_to_lod_tensor +------------------- +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor + :noindex: + + + + +fill_constant +------------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant + :noindex: + + + +fill_constant_batch_size_like +----------------------------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like + :noindex: + + +ones +---- +.. autofunction:: paddle.v2.fluid.layers.ones + :noindex: + + +zeros +----- +.. autofunction:: paddle.v2.fluid.layers.zeros + :noindex: + + +increment +--------- +.. autofunction:: paddle.v2.fluid.layers.increment + :noindex: + + +array_write +----------- +.. autofunction:: paddle.v2.fluid.layers.array_write + :noindex: + + + +create_array +------------ +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + + +less_than +--------- +.. autofunction:: paddle.v2.fluid.layers.less_than + :noindex: + + +array_read +---------- +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + + +shrink_memory +-------------- +.. autofunction:: paddle.v2.fluid.layers.shrink_memory + :noindex: + + +array_length +------------- +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + + +conv2d_transpose +---------------- +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose + :noindex: + + +sequence_expand +--------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_expand + :noindex: + + +gru_unit +-------- +.. autofunction:: paddle.v2.fluid.layers.gru_unit + :noindex: + + +lstm_unit +--------- +.. autofunction:: paddle.v2.fluid.layers.lstm_unit + :noindex: + + +sequence_softmax +---------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_softmax + :noindex: + + +reduce_sum +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_sum + :noindex: + + +reduce_mean +----------- +.. autofunction:: paddle.v2.fluid.layers.reduce_mean + :noindex: + + +reduce_max +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_max + :noindex: + + +reduce_min +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_min + :noindex: + + +split +----- +.. autofunction:: paddle.v2.fluid.layers.split + :noindex: + +logsigmoid +---------- +.. autofunction:: paddle.v2.fluid.layers.logsigmoid + :noindex: + +exp +--- +.. autofunction:: paddle.v2.fluid.layers.exp + :noindex: + +relu +---- +.. 
autofunction:: paddle.v2.fluid.layers.relu + :noindex: + +tanh +---- +.. autofunction:: paddle.v2.fluid.layers.tanh + :noindex: + +tanh_shrink +----------- +.. autofunction:: paddle.v2.fluid.layers.tanh_shrink + :noindex: + +softshrink +---------- +.. autofunction:: paddle.v2.fluid.layers.softshrink + :noindex: + +sqrt +---- +.. autofunction:: paddle.v2.fluid.layers.sqrt + :noindex: + +abs +---- +.. autofunction:: paddle.v2.fluid.layers.abs + :noindex: + +ceil +---- +.. autofunction:: paddle.v2.fluid.layers.ceil + :noindex: + +floor +----- +.. autofunction:: paddle.v2.fluid.layers.floor + :noindex: + +round +----- +.. autofunction:: paddle.v2.fluid.layers.round + :noindex: + +reciprocal +---------- +.. autofunction:: paddle.v2.fluid.layers.reciprocal + :noindex: + +log +--- +.. autofunction:: paddle.v2.fluid.layers.log + :noindex: + +square +------ +.. autofunction:: paddle.v2.fluid.layers.square + :noindex: + +softplus +-------- +.. autofunction:: paddle.v2.fluid.layers.softplus + :noindex: + +softsign +--------- +.. autofunction:: paddle.v2.fluid.layers.softsign + :noindex: + +brelu +----- +.. autofunction:: paddle.v2.fluid.layers.brelu + :noindex: + +leaky_relu +---------- +.. autofunction:: paddle.v2.fluid.layers.leaky_relu + :noindex: + +soft_relu +--------- +.. autofunction:: paddle.v2.fluid.layers.soft_relu + :noindex: + +elu +---- +.. autofunction:: paddle.v2.fluid.layers.elu + :noindex: + +relu6 +----- +.. autofunction:: paddle.v2.fluid.layers.relu6 + :noindex: + +pow +---- +.. autofunction:: paddle.v2.fluid.layers.pow + :noindex: + +hard_shrink +----------- +.. autofunction:: paddle.v2.fluid.layers.hard_shrink + :noindex: + +thresholded_relu +---------------- +.. autofunction:: paddle.v2.fluid.layers.thresholded_relu + :noindex: + +hard_sigmoid +------------- +.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid + :noindex: + +swish +------ +.. autofunction:: paddle.v2.fluid.layers.swish + :noindex: diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst new file mode 100644 index 0000000000000000000000000000000000000000..cca0dcdf082c0d809fab1aebba2c0b6c7b8efa2a --- /dev/null +++ b/doc/api/v2/fluid/nets.rst @@ -0,0 +1,27 @@ +=========== +Nets +=========== + +simple_img_conv_pool +-------------------- +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool + :noindex: + + +img_conv_group +--------------- +.. autofunction:: paddle.v2.fluid.nets.img_conv_group + :noindex: + + +sequence_conv_pool +------------------ +.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool + :noindex: + + +glu +--- +.. autofunction:: paddle.v2.fluid.nets.glu + :noindex: + diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..19b4940f08de3e2f7dc177f2961e538946d10a78 --- /dev/null +++ b/doc/api/v2/fluid/optimizer.rst @@ -0,0 +1,54 @@ +=========== +Optimizer +=========== + +Optimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: Optimizer + :noindex: + + +SGDOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: SGDOptimizer + :noindex: + + + +MomentumOptimizer +----------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: MomentumOptimizer + :noindex: + + + +AdagradOptimizer +---------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdagradOptimizer + :noindex: + + +AdamOptimizer +------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamOptimizer + :noindex: + + +AdamaxOptimizer +----------- +.. 
automodule:: paddle.v2.fluid.optimizer
+    :members: AdamaxOptimizer
+    :noindex:
+
+
+DecayedAdagradOptimizer
+-----------------------
+.. automodule:: paddle.v2.fluid.optimizer
+    :members: DecayedAdagradOptimizer
+    :noindex:
+
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50
--- /dev/null
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -0,0 +1,11 @@
+===========
+ParamAttr
+===========
+
+
+
+ParamAttr
+-----------
+.. automodule:: paddle.v2.fluid.param_attr
+    :members: ParamAttr
+    :noindex:
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7d4042d1f41c12c4a551ba6576559d612116872a
--- /dev/null
+++ b/doc/api/v2/fluid/profiler.rst
@@ -0,0 +1,10 @@
+===========
+Profiler
+===========
+
+
+
+Profiler
+-----------
+.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+    :noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..868e225ed3d59e79aeb217fb88081ea25f80fa2c
--- /dev/null
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -0,0 +1,25 @@
+===========
+Regularizer
+===========
+
+WeightDecayRegularizer
+----------------------
+.. automodule:: paddle.v2.fluid.regularizer
+    :members: WeightDecayRegularizer
+    :noindex:
+
+
+L2DecayRegularizer
+------------------
+.. automodule:: paddle.v2.fluid.regularizer
+    :members: L2DecayRegularizer
+    :noindex:
+
+
+
+L1DecayRegularizer
+-------------------
+.. automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
+
+
diff --git a/doc/design/backward.md b/doc/design/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a
--- /dev/null
+++ b/doc/design/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In neural networks, most models are currently solved by the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function, then propagates it back through the network following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So the framework needs a mechanism that can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. While building a model's backward part, the framework creates each forward `op`'s `grad_op` and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested, which means that `op`s and `variable`s are scattered across different blocks rather than gathered in a single graph. Our backward building algorithm must visit blocks recursively and be able to insert `grad_op`s and newly created `variable`s into the right places.
+
+## Usage
+
+Although the whole algorithm is composed of many functions, only one is exposed as an API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append the backward part to the main program.
+
+    Args:
+        loss(Variable): The variable generated by the cost function.
+        parameter_list(list): Parameters that need to be updated by optimizers.
+                              If None, it means all parameters need to be updated.
+
+        no_grad_set(set): Variables that have no gradients in Block 0.
+                          If None, the set will be generated inside the function and
+                          contains all variables with `stop_gradient=True` from all blocks.
+
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
+    """
+```
+
+By invoking this API, the framework appends the backward part to the program that the `loss` belongs to. The API takes three arguments. `loss` is the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all the parameters that need to be updated. If it is `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients. If all the outputs of some `grad_op` are in `no_grad_set`, that `grad_op` will not be run.
+
+This API is invoked automatically before optimizer building.
+As a result, in most cases, users do not need to invoke it themselves to append the backward part.
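+
+A minimal usage sketch follows; the layer names and shapes are illustrative only, the import path is assumed to be `paddle.v2.fluid`, and in practice `Optimizer.minimize()` makes this call for you:
+
+```python
+import paddle.v2.fluid as fluid
+
+# A tiny forward program: linear regression with a square error loss.
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)  # the `loss` argument must be a scalar
+
+# Explicitly append the backward part and get (parameter, gradient) pairs.
+params_grads = fluid.backward.append_backward(avg_cost)
+for param, grad in params_grads:
+    print(param.name, grad.name)
+```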
+
+## Implementation
+
+The backward building algorithm is implemented in the file `backward.py`. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
+
+### Creating `grad_op`s
+
+The creation of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict):
+            key(int) block index
+            val(set) a set of variable names. These variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Because these sub-blocks contain `op`s as well, `grad_op` creation has to be recursive.
+
+During the reverse traversal, we check each `op` for an attribute named `sub_block`. If the attribute is present, there is a sub-block and we need to deal with it first. After creating a new block whose parent is the one in the `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to the parameter `target_block` and the one in the `op`'s attribute to `block`. The following *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Create a new block(`grad_s_block`), whose parent is `s_block`.
+        Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+
+    Invoke `core.get_grad_op_desc()` to get op's grad_op.
+    Insert the name correspondences between variables and their gradients of the grad_op into grad_to_var.
+    Assign grad_s_block to grad_op as its 'sub_block' attribute.
+    Append grad_op to current target_block.
+```
+
+The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+The previous section shows the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result, and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the gradient result the sum of all the `grad_op`s' outputs instead of just the last one that runs, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
+
+See the function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
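+
+To see what the renaming achieves, here is a tiny self-contained sketch of the idea in plain Python, with a toy dictionary standing in for a real `Scope` and `sum_op`; it is not the Fluid implementation:
+
+```python
+grads = {}  # variable name -> value, a stand-in for a Scope
+
+def write_grad(name, value):
+    # Each duplicated write is redirected to a fresh temporary variable.
+    k = sum(1 for n in grads if n.startswith(name + "@RENAME@"))
+    grads["%s@RENAME@%d" % (name, k)] = value
+
+# Two grad_ops both write the gradient of the shared variable `w`.
+write_grad("w@GRAD", 0.5)
+write_grad("w@GRAD", 1.5)
+
+# The inserted sum_op adds the temporaries up into the final gradient.
+grads["w@GRAD"] = sum(v for n, v in grads.items()
+                      if n.startswith("w@GRAD@RENAME@"))
+print(grads["w@GRAD"])  # 2.0
+```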
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of the variable is unnecessary and can be considered as zero in model training. Obviously, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is that all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all the gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+This feature is implemented in the function `_remove_no_grad_branch_`. It checks the newly created `grad_op`s one by one, removes the ones that can be skipped, and inserts `fill_zeros_like_op` when necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all the creation and adjustment work for `grad_op`s. However, backward variables have not been created yet; so far they are only represented by the `grad_op`s' input and output arguments. The backward variables are created by:
+
+```python
+def _append_backward_vars_(block,
+                           start_op_idx,
+                           grad_to_var,
+                           grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+    """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the grad_op sequence starts) and creates all the outputs that have not been created yet. The following *pseudo-code* shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Invoke _append_backward_vars_(), with `block=s_block`
+
+    for var_name in op.all_output_names():
+        if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+            continue
+        create a new variable named 'var_name' in block
+        if grad_to_var.has_key(var_name):
+            set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+
+    do op's var type inference
+    do op's shape inference
+```
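+
+The recursion over nested blocks can be sketched in a few lines of standalone Python; `Block` and `Op` here are toy stand-ins invented for illustration, not the real framework classes:
+
+```python
+class Op(object):
+    def __init__(self, outputs, sub_block=None):
+        self.outputs = outputs      # names of output variables
+        self.sub_block = sub_block  # optional nested Block
+
+class Block(object):
+    def __init__(self, ops, parent=None):
+        self.ops = ops
+        self.parent = parent
+        self.vars = set()
+
+    def has_var_recursive(self, name):
+        if name in self.vars:
+            return True
+        return self.parent.has_var_recursive(name) if self.parent else False
+
+def append_backward_vars(block, start_op_idx=0):
+    """Create every not-yet-existing output of ops[start_op_idx:], recursing into sub-blocks."""
+    for op in block.ops[start_op_idx:]:
+        if op.sub_block is not None:
+            append_backward_vars(op.sub_block)
+        for var_name in op.outputs:
+            if not block.has_var_recursive(var_name):
+                block.vars.add(var_name)
+
+root = Block([Op(["x@GRAD"]), Op(["y@GRAD"])])
+root.ops[1].sub_block = Block([Op(["z@GRAD"])], parent=root)
+append_backward_vars(root)
+print(sorted(root.vars))  # ['x@GRAD', 'y@GRAD']; z@GRAD lives in the sub-block
+```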
diff --git a/doc/design/block.md b/doc/design/block.md
index 4066122c0e8dfa33776796c3d205ba5aec9e0f52..907a2def557fd472ac4d679c73447bd9107d1190 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -202,8 +202,8 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
 During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
 
-VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example:
+VarDesc in a block should have its name scope to avoid local variables affecting the parent block's name scope.
+Child block's name scopes should inherit the parent's so that OpDesc in a child block can reference a VarDesc that is stored in the parent block. For example:
 
 ```python
 a = pd.Variable(shape=[20, 20])
@@ -291,10 +291,10 @@ public:
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
     PADDLE_ENFORCE(symbols_ready_,
                    "operators and variables should be created first.");
     for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
     }
   }
diff --git a/doc/design/ci_build_whl.png b/doc/design/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/design/ci_build_whl.png differ
diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md
new file mode 100644
index 0000000000000000000000000000000000000000..afc65e831d58ff427663806e56294292ccbef85b
--- /dev/null
+++ b/doc/design/concurrent_programming.md
@@ -0,0 +1,163 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, rather than the program that trains/uses the model.
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how a `ProgramDesc` represents a concurrent program**.
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users can write a concurrent program just as they would in any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go:
+
+| Go | Fluid |
+|----|-------|
+| user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
+| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
+| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
+| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+## An Example Concurrent Program
+
+To review all the above concepts with an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function; the latter, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+  paddlepaddle()
+  fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message representing the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
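+
+Before looking at the transpiled programs, the arithmetic being distributed is easy to verify in plain numpy (purely illustrative, outside of Fluid):
+
+```python
+import numpy as np
+
+X = np.random.rand(1000, 64)   # a "very big" tensor, split along rows
+W = np.random.rand(64, 10)
+
+pieces = np.array_split(X, 4)           # {x_1, x_2, ...}
+partials = [x.dot(W) for x in pieces]   # each piece multiplied on a "worker"
+Y = np.concatenate(partials)            # gather the results on the master
+
+assert np.allclose(Y, X.dot(W))
+```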
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, L, Y],
+    ops = [
+      read(output = X)
+      kube_get_workers_addrs(output = L)
+      Y = tensor_array(len(L))
+      parallel_for(input = X, output = Y,
+                   attrs = {L, block_id(1)}) # referring to block 1
+    ]
+  }
+
+  block[1] = Block {
+    parent = 0,
+    vars = [x, y, index],
+    ops = [
+      slice(input = [X, index], output = x) # index is initialized by parallel_for
+      send(input = x, attrs = L[index])
+      recv(outputs = y, attrs = L[index])
+      assign(input = y, output = Y[index])
+    ]
+  }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() {  //// block 0
+  X = fluid.read(...)
+  L = fluid.k8s.get_worker_addrs()
+  Y = fluid.tensor_array(len(L))
+  fluid.parallel_for(X, L,
+                     func(index int) {  //// block 1
+                       x = X[index]
+                       fluid.send(L[index], x)
+                       y = fluid.recv(L[index])
+                       Y[index] = y
+                     })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to the Kubernetes API.
+- `fluid.k8s.get_worker_addrs` returns the list of IP addresses and ports of all pods of the current job except for the current one (the master pod).
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
+     1. creates an Executor instance, and
+     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+  3. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
+### The Worker Program
+
+The worker program looks like:
+
+```go
+func main() {
+  W = Tensor(...)
+  x = fluid.listen_and_do(
+        fluid.k8s.self_addr(),
+        func(input Tensor) {
+          output = fluid.mult(input, W)
+        })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+  1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+  2. once a connection is established,
+     1. creates a scope of two parameters, "input" and "output",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
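+
+The `ListenAndDo` pattern is essentially a tiny request server. A rough, framework-free Python analogue (the socket handling and names here are illustrative, not Fluid APIs) could be:
+
+```python
+import socket
+import threading
+
+def listen_and_do(addr, handler):
+    """Accept connections forever; run `handler` on each one in its own thread."""
+    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    srv.bind(addr)
+    srv.listen(5)
+    while True:
+        conn, _ = srv.accept()  # one incoming request
+        t = threading.Thread(target=handler, args=(conn,))
+        t.daemon = True
+        t.start()
+
+def handler(conn):
+    data = conn.recv(4096)      # read the "input" variable
+    conn.sendall(data.upper())  # stand-in for output = mult(input, W)
+    conn.close()
+
+# listen_and_do(("0.0.0.0", 8000), handler)  # blocks forever when uncommented
+```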
+
+## Summarization
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+   2. calling the `fluid.run` function that runs the program implicitly.
+2. The program is described as a `ProgramDesc` protobuf message.
+3. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+4. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+5. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes the intrinsics'/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
+6. Intrinsics'/operators' `Run` methods might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+7. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md
new file mode 100644
index 0000000000000000000000000000000000000000..3a741f95866fb6c301ca9097af7916281f2278cf
--- /dev/null
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -0,0 +1,197 @@
+# Design Doc: Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
+
+1. There is a need to write special code that handles tasks which should only be run on a single trainer, e.g., initializing the model, saving the model, etc.
+
+2. Model parallelism is hard: it would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually requires manually writing out the inter-model-shard communication code to communicate between different trainers.
+
+3. The user cannot directly specify the parameter update rule: this would require modifying the parameter server code and compiling a new binary. That makes things more complicated for researchers: a lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above-mentioned limitations.
+
+## Analysis
+
+The assumption is that the user writes the trainer program in either Python or C++.
+
+### Limitation 1
+
+There are two basic functionalities in the trainer program:
+
+1. The training logic, such as loading / saving the model and printing out the logs.
+2. The neural network definition, such as the definition of the data layer, the fully connected layer, the cost function and the
+   optimizer.
+
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
+
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
+
+### Limitation 2
+
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
+
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically.
+However, the computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle cannot support the feature in this setup.
+
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code in most cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
+
+
+
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
+
+
+
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
+
+### Limitation 3
+
+The user cannot directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user cannot specify the update rule explicitly.
+
+This could be fixed by making the parameter server also run an IR, which can be different from the trainer side's.
+For a detailed explanation, refer to this document -
+[Design Doc: Parameter Server](./parameter_server.md)
+
+## Distributed Training Architecture
+
+The revamped distributed training architecture can address the above-discussed limitations. Below is an illustration of how it does so:
+
+
+
+The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
+
+### Python API
+
+The Python API is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
+
+```Python
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+    for data in train_reader():
+        loss, acc = exe.run(trainer_prog,
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost])
+```
+
+The code above is a typical local training program; the "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, the implementation of the IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify the desired distributed training method with some optional arguments.
+
+### Distributed Transpiler
+
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed:
+
+1. Users only need to change `Executor` to `RemoteExecutor` to change a local program into a distributed program.
+1. `RemoteExecutor` calls the `Distributed Transpiler` to "transpile" the user's program into several IRs representing a
+   distributed training program:
+   1. Parse configurations from `RemoteExecutor`.
+   1. Determine the type of distributed program, which can be DataParallelism, ModelParallelism or Streaming.
+   1. Partition the `ProgramDesc` according to the type and add a `send` / `recv` OP pair on the boundaries. Taking the
+      DataParallelism type as an example, it removes the optimization operators and adds a `send` OP to the
+      "trainer" role, then adds the optimization operators to the parameter server role behind the `recv` OP.
+1. Dispatch the partitioned graph to the different `RemoteExecutor`s in the cluster.
+1. The `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
+
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
+You can also use the parameter `fetch_list` to interactively fetch variables back locally for
+log printing.
+
+The Python `RemoteExecutor` is derived from the `Executor` class.
+
+```python
+exe = RemoteExecutor(
+    feed=feeder.feed(data),
+    fetch_list=[avg_cost],
+    job_desc=JobDesc(
+        jobname,
+        num_trainer,
+        num_pserver,
+        cpu_per_trainer,
+        gpu_per_trainer,
+        mem_per_trainer,
+        cpu_per_pserver,
+        mem_per_pserver
+    ))
+for data in train_reader():
+    loss, acc = exe.run(trainer_prog,
+                        feed=feeder.feed(data),
+                        fetch_list=[avg_cost])
+```
+
+The `JobDesc` object describes the distributed job resource specification to run in the
+cluster environment.
+
+
+
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+for starting the final Kubernetes Jobs to run the different roles of the `ProgramDesc`.
+
+
+### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
+
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed training architecture; the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
+
+
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read
+with a [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training in a distributed fashion, since the Python
+process no longer runs on the same node as the trainer processes;
+the Python reader would need to read from the distributed filesystem
+(assuming it has access) and send the data to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use a Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. Users are encouraged to use
+the data-reading OPs.
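+
+For reference, a data reader in this API is just a function that returns a generator, so moving from the built-in datasets to file-backed input is straightforward; the file name below is made up for illustration:
+
+```python
+import paddle.v2 as paddle
+
+def csv_reader(path):
+    # A reader creator: returns a generator yielding one sample per line.
+    def reader():
+        with open(path) as f:
+            for line in f:
+                fields = [float(v) for v in line.strip().split(',')]
+                yield fields[:-1], fields[-1]  # (features, label)
+    return reader
+
+# The same shuffle/batch pipeline as in the example above.
+train_reader = paddle.batch(
+    paddle.reader.shuffle(csv_reader('train.csv'), buf_size=500),
+    batch_size=128)
+```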
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/design/dist_refactor/multi_cpu.md b/doc/design/dist_refactor/multi_cpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8d8ee0422acc84835170a44eb83f9b5f0c6bb40
--- /dev/null
+++ b/doc/design/dist_refactor/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run the user-defined Op graph on
+multiple CPUs: we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+
+
+After conversion:
+
+
+
+## Implementation
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which will be executed with multiple threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and blocking `Wait`
+  until the atomic counter becomes `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] { bc.DecrementCount(); });
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and run the specified Block on independent scopes
+    with multiple threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merges all the gradients calculated in different threads
+  with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multiple threads, since we could
+  assign the parameters to different threads and execute the
+  optimizer with multiple threads.
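+
+The same fork-join pattern can be mimicked in plain Python with a condition variable; the sketch below mirrors the C++ `BlockingCounter` above (illustrative, not a PaddlePaddle API):
+
+```python
+import threading
+
+class BlockingCounter(object):
+    """Counts down from `count`; wait() blocks until the counter reaches zero."""
+    def __init__(self, count):
+        self._count = count
+        self._cond = threading.Condition()
+
+    def decrement_count(self):
+        with self._cond:
+            self._count -= 1
+            if self._count == 0:
+                self._cond.notify_all()
+
+    def wait(self):
+        with self._cond:
+            while self._count > 0:
+                self._cond.wait()
+
+def worker(bc, i):
+    print("thread %d done" % i)  # stand-in for running one sub-block
+    bc.decrement_count()
+
+thread_count = 4
+bc = BlockingCounter(thread_count)
+for i in range(thread_count):
+    threading.Thread(target=worker, args=(bc, i)).start()
+bc.wait()  # join point, as in the C++ snippet above
+```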
diff --git a/doc/design/refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md
similarity index 76%
rename from doc/design/refactor/parameter_server.md
rename to doc/design/dist_refactor/parameter_server.md
index fa3c5d7990213cf2b0d236e66e592dd2699da876..1094f06d461275a9ad4034d5e48b39856d967b71 100644
--- a/doc/design/refactor/parameter_server.md
+++ b/doc/design/dist_refactor/parameter_server.md
@@ -1,4 +1,4 @@
-# Design Doc: Operation Graph Based Parameter Server
+# Design Doc: Parameter Server
 
 ## Abstract
 
@@ -10,7 +10,7 @@ different purposes.
 ## Background
 
 The previous implementations of the parameter server does not run a
-subgraph. parameter initialization, optimizer computation, network
+fluid sub-program. Parameter initialization, optimizer computation, network
 communication and checkpointing are implemented twice on both the
 trainer and the parameter server.
 
@@ -23,10 +23,10 @@ server becomes a natural extension.
 
 ## Design
 
-### Graph Converter
+### Distributed Transpiler
 
-The *graph converter* converts the user-defined operation (OP) graph
-into subgraphs to be scheduled on different nodes with the following
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
 steps:
 
 1. OP placement: the OPs will be placed on different nodes according
@@ -34,7 +34,6 @@ steps:
    time. Currently we will use a simple heuristic that puts parameter
   variable on parameter server workers and everything else on trainer
   workers.
-
 1. Add communication OPs to enable the communication between nodes.
 
 We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
@@ -48,8 +47,8 @@ After converting:
 
-1. The parameter variable W and it's optimizer subgraph are placed on the parameter server.
-1. Operators are added to the subgraphs.
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
    - *Send* sends data to the connected *Recv* operator.  The
      scheduler on the receive node will only schedule *Recv* operator
     to run when the *Send* operator has run (the *Send* OP will mark
@@ -64,39 +63,30 @@ After converting:
 
 ### Benefits
 
 - Model parallelism becomes easier to implement: it's an extension to
-  the trainer - parameter server approach. we already have the
-  communication OPs, but need to extend the graph converter's
-  placement functionality.
-
+  the trainer - parameter server approach. We can have several "Transpilers"
+  to achieve different goals.
 - User-defined optimizer is easier to add - user can now express it as
-  a subgraph.
-
+  a sub-program.
 - No more duplication logic inside the trainer and the parameter
   server mentioned in the background section.
 
 ### Challenges
 
-- It might be hard for the graph converter to cut a general graph
-  (without any hint for which subgraph is the optimizer). We may need
-  to label which subgraph inside the OP graph is the optimizer.
-
 - It's important to balance the parameter shards on multiple
   parameter servers. If a single parameter is very big (some
  word-embedding, fully connected, softmax layer), we need to
  automatically partition the single parameter onto different
  parameter servers when possible (only element-wise optimizer depends
  on the parameter variable).
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently. See
+  [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+  details about concurrent programs in fluid.
 
 ### Discussion
 
-- In the "Aync SGD" figure, the "W" variable on the parameter server
-  could be read and wrote concurrently, what is our locking strategy?
-  E.g., each variable have a lock cpp method to be invoked by every
-  OP, or, have a lock OP.
-
 - Can the Enqueue OP be implemented under our current tensor design
   (puts the input tensor into the queue tensor)?
-
 - *Dequeue* OP will have variable numbers of output (depends on the
   `min_count` attribute), does our current design support it?
(similar question for the *Add* OP) diff --git a/doc/design/refactor/src/compiler.graffle b/doc/design/dist_refactor/src/compiler.graffle similarity index 100% rename from doc/design/refactor/src/compiler.graffle rename to doc/design/dist_refactor/src/compiler.graffle diff --git a/doc/design/refactor/src/compiler.png b/doc/design/dist_refactor/src/compiler.png similarity index 100% rename from doc/design/refactor/src/compiler.png rename to doc/design/dist_refactor/src/compiler.png diff --git a/doc/design/refactor/src/dist-graph.graffle b/doc/design/dist_refactor/src/dist-graph.graffle similarity index 100% rename from doc/design/refactor/src/dist-graph.graffle rename to doc/design/dist_refactor/src/dist-graph.graffle diff --git a/doc/design/refactor/src/dist-graph.png b/doc/design/dist_refactor/src/dist-graph.png similarity index 100% rename from doc/design/refactor/src/dist-graph.png rename to doc/design/dist_refactor/src/dist-graph.png diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/design/dist_refactor/src/distributed_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907 Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.graffle differ diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/design/dist_refactor/src/distributed_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356 Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.png differ diff --git a/doc/design/refactor/src/local-graph.graffle b/doc/design/dist_refactor/src/local-graph.graffle similarity index 100% rename from doc/design/refactor/src/local-graph.graffle rename to doc/design/dist_refactor/src/local-graph.graffle diff --git a/doc/design/refactor/src/local-graph.png b/doc/design/dist_refactor/src/local-graph.png similarity index 100% rename from doc/design/refactor/src/local-graph.png rename to doc/design/dist_refactor/src/local-graph.png diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/design/dist_refactor/src/local_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877 Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.graffle differ diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/design/dist_refactor/src/local_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122 Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.png differ diff --git a/doc/design/dist_refactor/src/multi-threads.graffle b/doc/design/dist_refactor/src/multi-threads.graffle new file mode 100644 index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6 Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads.graffle differ diff --git a/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9 Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png differ diff --git a/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png 
b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png differ diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/design/dist_refactor/src/paddle-compile.graffle similarity index 100% rename from doc/design/refactor/src/paddle-compile.graffle rename to doc/design/dist_refactor/src/paddle-compile.graffle diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/design/dist_refactor/src/paddle-compile.png similarity index 100% rename from doc/design/refactor/src/paddle-compile.png rename to doc/design/dist_refactor/src/paddle-compile.png diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle new file mode 100644 index 0000000000000000000000000000000000000000..ce2c18fee5687732053c48af9c8c290a994a8090 Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.graffle differ diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png new file mode 100644 index 0000000000000000000000000000000000000000..6be4b1841b99efdb59557975485d0387f422308c Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.png differ diff --git a/doc/design/error_clip.md b/doc/design/error_clip.md new file mode 100644 index 0000000000000000000000000000000000000000..8e845462cce2a29556bcb6010b08f00fbc3d99d7 --- /dev/null +++ b/doc/design/error_clip.md @@ -0,0 +1,87 @@ +# Error Clip + +## Overview + +Error clip is widely used in model training to prevent gradient exploding. It takes some specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and be shrunk if necessary. +## Usage + +Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor: + +```python +var = framework.Variable(..., error_clip=myErrorClip, ...) +``` + +The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is: + +```python +ErrorClipByValue(max, min=None) +``` + +`max` and `min` represent the maximal and minimal clip threshold respectively. In backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When the `min` is None, the minimal threshold will be assigned with `-max` automatically. + +So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by: + +```python +var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...) +``` + +## Implementation + +The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*. 
+
+```python
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        block.append_op(
+            type="clip",
+            inputs={"X": grad_name},
+            outputs={"Out": grad_name},
+            attrs={"min": self.min,
+                   "max": self.max})
+```
+
+`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function creates a `clip_op` and appends it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is declared virtual in the base class, and all derived classes must implement their own versions of it.
+
+These `clip_op`s should be inserted after the `grad_op`s whose output gradients need to be clipped. That is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added:
+
+```python
+for op_desc in grad_op_descs:
+    new_op_desc = target_block.desc.append_op()
+    new_op_desc.copy_from(op_desc)
+    callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, a callback function is invoked each time after a `grad_op` is added to the `target_block`. The logic of appending `clip_op`s can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Note that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` calls the `error_clip`'s `append_clip_op` function to append the required `clip_op` to the `block`.
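+
+As a quick numeric illustration of the clipping rule itself, the following is a minimal sketch in plain NumPy, independent of the framework code above; the gradient values are made up for the example:
+
+```python
+import numpy as np
+
+# A hypothetical gradient produced by some grad_op.
+grad = np.array([-7.5, -2.0, 0.3, 4.9, 6.1])
+
+# ErrorClipByValue(max=5.0) implies min = -5.0.
+clipped = np.clip(grad, -5.0, 5.0)
+
+print(clipped)  # [-5.  -2.   0.3  4.9  5. ]
+```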
diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md index a62d75ffef14962aec8c7587e172d78dfe0cb4be..11cc129d56905a9ee666da92fbe6f8559c6d325a 100644 --- a/doc/design/evaluator.md +++ b/doc/design/evaluator.md @@ -1,22 +1,22 @@ ## Evaluator Design
-### The Problem
+### Problem Statement
-During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for every N passes/batches, as the user wants.
 ### Evaluator Design
-Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
 1. Initialize the metric state and add it into the block.
-2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy of one minibatch of data when run once.
 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
 ### Implementation
-This design is shown in python API.
-Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass.
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistics and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
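+
+To make the three steps above concrete, here is a minimal, framework-independent sketch of how the Python side might accumulate per-minibatch statistics into a pass-level metric; the class and method names are illustrative only, not the actual Fluid API:
+
+```python
+class AccuracyAccumulator(object):
+    """A sketch of pass-level accumulation, not the real evaluator."""
+
+    def __init__(self):
+        self.correct = 0  # metric state, initialized once per pass (step 1)
+        self.total = 0
+
+    def update(self, minibatch_correct, minibatch_size):
+        # Step 2: each operator run yields statistics for one minibatch.
+        self.correct += minibatch_correct
+        self.total += minibatch_size
+
+    def eval(self):
+        # Step 3: merge minibatch statistics into the pass-level result.
+        return float(self.correct) / self.total
+
+acc = AccuracyAccumulator()
+for correct, size in [(95, 128), (101, 128), (60, 64)]:
+    acc.update(correct, size)
+print(acc.eval())  # 0.8
+```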
```python
diff --git a/doc/design/executor.md b/doc/design/executor.md index b5fb6c5c3c1da3c112ce63878322083dd5c42b70..2d4b371cc56db82ce5747da6db07f05aa7f7e6c1 100644 --- a/doc/design/executor.md +++ b/doc/design/executor.md @@ -1,23 +1,29 @@ # Executor Design Doc
 ## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-We use executor to do the runtime evaluation of a `ProgramDesc`.
+The executor runs the `ProgramDesc` like an interpreter: the `ProgramDesc` contains the intrinsics (operators in this case) and the variables that will be used, and the executor explicitly executes this stored, precompiled code.
 ## Overview
-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
-### What does executor do?
+## Executor
-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a stack frame is pushed when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have the stack frame pop process.
-### What does executor NOT do?
+### The interface
+```c++
+  Executor(places);
+```
+An executor does not own any computing resources; a user can only construct an executor with the specified places.
-It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
+### Running an Executor
-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables are destroyed after the execution is finished.
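+
+The run-in-sequence semantics can be pictured with a toy interpreter; this is only an analogy in plain Python, not Fluid code. The "scope" is a dict of variables that persists across runs, and each "op" reads its inputs and writes its outputs in order:
+
+```python
+def run_block(ops, scope):
+    # Analogous to Executor::Run: execute the ops one-by-one against a scope.
+    for op in ops:
+        op(scope)
+
+def mul_op(scope):
+    scope["y"] = scope["x"] * scope["w"]
+
+def add_op(scope):
+    scope["out"] = scope["y"] + scope["b"]
+
+scope = {"x": 3.0, "w": 2.0, "b": 1.0}  # the persistent variable container
+run_block([mul_op, add_op], scope)
+print(scope["out"])  # 7.0
+```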
diff --git a/doc/design/float16.md b/doc/design/float16.md index 078801ba2ed969d26dd31d5ec4ed268686cf7016..1ea95ed6b5d6792171569b6ff76d09be92fcb13e 100644 --- a/doc/design/float16.md +++ b/doc/design/float16.md @@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
 - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+### CUDA version issue
+There are currently three versions of CUDA that support the `__half` data type, namely, CUDA 7.5, 8.0, and 9.0.
+CUDA 7.5 and 8.0 define `__half` as a simple struct that holds a `uint16_t` value (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)):
+```
+typedef struct __align__(2) {
+  unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators, so you have to use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b;        // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type of 7.5 and 8.0 as `__half_raw` and defines a new `__half` class type that has constructors, conversion operators, and overloaded arithmetic operators, as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+  unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+  unsigned short __x;
+public:
+  // constructors and conversion operators from/to
+  // __half_raw and other built-in data types
+};
+
+typedef __half half;
+
+__device__ __forceinline__
+__half operator+(const __half &lh, const __half &rh) {
+  return __hadd(lh, rh);
+}
+
+// Other overloaded operators
+```
+This new design makes `c = a + b` work correctly for the CUDA half data type.
 ## Implementation
 The float16 class holds a 16-bit `uint16_t` data internally.
diff --git a/doc/design/fluid-compiler.graffle b/doc/design/fluid-compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d Binary files /dev/null and b/doc/design/fluid-compiler.graffle differ diff --git a/doc/design/fluid-compiler.png b/doc/design/fluid-compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb Binary files /dev/null and b/doc/design/fluid-compiler.png differ diff --git a/doc/design/fluid.md b/doc/design/fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..585dc8ef39c0cfb30f470d79f7b27a59ceb5e940 --- /dev/null +++ b/doc/design/fluid.md @@ -0,0 +1,122 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what was the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describe the "process" of training or inference instead of using the concept of a model. In fact, in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in this idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms*, or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer makes a mistake in configuring the model, the error messages won't show up until the second part is executed and the `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is blocks away from the actual error prompt.
+
+This problem of being hard to debug and iterate on quickly is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as follows:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is moving the model configuration part (the first step) into the training loop. This change allows mistakes in the model configuration to be reported where they actually appear in the programming block. It also represents the model, or rather its forward pass, better by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange(x.len()):
+        h[t] = the_step(x[t])
+```
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++, as follows:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```
+
+An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalents, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid, and the sketch below illustrates its semantics.
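+
+As discussed in the next section, Fluid's `if-then-else` runs both branches and splits the input mini-batch between them. The following plain-Python/NumPy sketch mimics that split-and-merge semantics; it is an illustration only, not Fluid's actual API:
+
+```python
+import numpy as np
+
+def split_if_else(x, cond, true_fn, false_fn):
+    # Run BOTH branches, each on its own slice of the mini-batch,
+    # then merge the results back into the original order.
+    out = np.empty_like(x)
+    out[cond] = true_fn(x[cond])
+    out[~cond] = false_fn(x[~cond])
+    return out
+
+x = np.array([-2.0, 1.0, -0.5, 3.0])
+y = split_if_else(x, x > 0, lambda v: v * 10, lambda v: -v)
+print(y)  # [ 2.  10.   0.5 30. ]
+```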
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is noteworthy that there is a slight difference between the `if-then-else` of Fluid and that of a programming language: the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. It has not been researched in depth whether this is equivalent to the `if-then-else` of programming languages that makes them Turing-complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explained in more detail later in this article.
+
+## Backward Compatibility of Fluid
+
+Despite all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of that concept, since it makes it easier for them to support multiple frameworks all at once and to run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format for graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
+
+## Towards a Deep Learning Language and the Compiler
+
+We can change the `if-then-else` and loop structures a little bit in the above Fluid example programs to make it into a new programming language, different from Python.
+
+Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler that translates each invocation of an operator into a C++ call to a kernel function of that operator. For example, a transpiler that weaves in the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`. Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel. More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server.
For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer. The following figure explains the proposed two-stage process: + +![](fluid-compiler.png) diff --git a/doc/design/images/control_flow_graph.png b/doc/design/images/control_flow_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b Binary files /dev/null and b/doc/design/images/control_flow_graph.png differ diff --git a/doc/design/images/dataflow_equations.png b/doc/design/images/dataflow_equations.png new file mode 100644 index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658 Binary files /dev/null and b/doc/design/images/dataflow_equations.png differ diff --git a/doc/design/images/deep_learning.png b/doc/design/images/deep_learning.png new file mode 100644 index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff Binary files /dev/null and b/doc/design/images/deep_learning.png differ diff --git a/paddle/framework/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle similarity index 100% rename from paddle/framework/images/duplicate_op.graffle rename to doc/design/images/duplicate_op.graffle diff --git a/paddle/framework/images/duplicate_op.png b/doc/design/images/duplicate_op.png similarity index 100% rename from paddle/framework/images/duplicate_op.png rename to doc/design/images/duplicate_op.png diff --git a/paddle/framework/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle similarity index 100% rename from paddle/framework/images/duplicate_op2.graffle rename to doc/design/images/duplicate_op2.graffle diff --git a/paddle/framework/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png similarity index 100% rename from paddle/framework/images/duplicate_op2.png rename to doc/design/images/duplicate_op2.png diff --git a/doc/design/images/multigpu_allreduce.graffle b/doc/design/images/multigpu_allreduce.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779 Binary files /dev/null and b/doc/design/images/multigpu_allreduce.graffle differ diff --git a/doc/design/images/multigpu_allreduce.png b/doc/design/images/multigpu_allreduce.png new file mode 100644 index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a Binary files /dev/null and b/doc/design/images/multigpu_allreduce.png differ diff --git a/doc/design/images/multigpu_before_convert.graffle b/doc/design/images/multigpu_before_convert.graffle new file mode 100644 index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855 Binary files /dev/null and b/doc/design/images/multigpu_before_convert.graffle differ diff --git a/doc/design/images/multigpu_before_convert.png b/doc/design/images/multigpu_before_convert.png new file mode 100644 index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1 Binary files /dev/null and b/doc/design/images/multigpu_before_convert.png differ diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png new file mode 100644 index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1 Binary files /dev/null and b/doc/design/images/profiler.png differ diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md new file mode 100644 index 
0000000000000000000000000000000000000000..a54b7da045e1a362626ef066f9ebb56af2c3181a --- /dev/null +++ b/doc/design/kernel_hint_design.md @@ -0,0 +1,57 @@
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to express this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+`place_`, `data_type_` and `layout_` can be obtained from the input tensors of the operator: `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot directly configure it.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can overload to choose the KernelType they want to use.
+
+So we should send the information users define in the proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choices
+1. Do nothing: let users add the information they want to the operator's attributes and fetch it inside `GetExpectedKernelType`. This can work properly, but there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attr key such as `kernel_hint` for the user. This is not flexible enough if the user wants to define new kinds of hints.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn` and `use_mkldnn`, for users to choose from.
+
+In C++:
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...);
+  } else {
+    ...
+  }
+}
+```
+
+In Python code:
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=False):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..1f68cef4cc28cd005acbeaa5c03cc0d84a83939c --- /dev/null +++ b/doc/design/memory_optimization.md @@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In a lecture, Andrew Ng attributes the recent success of AI to a combination of:
+
+- Availability of Big Data
+- Supercomputing power to process this Big Data over very large neural networks
+- Modern algorithms
+
+The following graph shows the details:
+
+![](images/deep_learning.png)
+
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage.
Besides, memory optimization is also necessary for online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
+
+#### In-place Operation
+In a relu activation operator:
+
+$y = \max(x, 0)$
+
+If the variable x is not used in any other operator, we can perform an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations save 50% of memory occupancy immediately.
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+The following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used after op2, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then, variable e can share the memory of variable a from the pool.
+
+
+### Live Variable Analysis
+
+It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know whether a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
+
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
+
+We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+The following is the flow graph for a simple loop.
+
+![](images/control_flow_graph.png)
+
+#### Dataflow Analysis
+
+Liveness of a variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform dataflow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
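+
+This iterative solution can be written down directly. Below is a small self-contained Python sketch of the classic backward dataflow iteration, using the *use*/*def*/*succ* notation explained in the terminology notes that follow; it reproduces the three-op worked example given later in this section and is illustrative only:
+
+```python
+def liveness(nodes, succ, use, defs):
+    live_in = {n: set() for n in nodes}
+    live_out = {n: set() for n in nodes}
+    changed = True
+    while changed:  # iterate until a fixed point is reached
+        changed = False
+        for n in reversed(nodes):  # reverse order converges faster
+            new_out = set()
+            for s in succ[n]:
+                new_out |= live_in[s]
+            new_in = use[n] | (new_out - defs[n])
+            if new_in != live_in[n] or new_out != live_out[n]:
+                live_in[n], live_out[n] = new_in, new_out
+                changed = True
+    return live_in, live_out
+
+# a = op1(b, c); d = op2(a); e = op3(d, f)
+nodes = ["op1", "op2", "op3"]
+succ = {"op1": ["op2"], "op2": ["op3"], "op3": []}
+use = {"op1": {"b", "c"}, "op2": {"a"}, "op3": {"d", "f"}}
+defs = {"op1": {"a"}, "op2": {"d"}, "op3": {"e"}}
+
+live_in, live_out = liveness(nodes, succ, use, defs)
+print(live_in["op1"])   # {'b', 'c', 'f'} (set order may vary)
+print(live_out["op1"])  # {'a', 'f'}
+```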
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
+
+![](images/dataflow_equations.png)
+
+### Memory optimization transpiler
+
+Finally, we take the basic strategies and the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### construct control flow graph
+
+The following is the ProgramDesc protobuf of the [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
+
+```python
+from collections import defaultdict
+
+class ControlFlowGraph(object):
+    def __init__(self, Program):
+        self._successors = defaultdict(set)
+        self._predecessors = defaultdict(set)
+        self._uses = defaultdict(set)
+        self._defs = defaultdict(set)
+        self._live_in = defaultdict(set)
+        self._live_out = defaultdict(set)
+        self._program = Program
+
+    def build(self):
+        pass
+
+    def dataflow_analysis(self):
+        pass
+
+    def memory_optimization(self):
+        pass
+
+    def get_program(self):
+        return self._program
+```
+
+#### Perform dataflow analysis
+
+We follow the guidance from compilers and try to solve the dataflow equations to get the liveness of every variable. If the live-in of an operator node is different from its live-out, then we can apply memory sharing.
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, we can process variable b and variable c; after op2, we can process variable a; after op3, we can process variable d and variable f.
+
+#### memory sharing policy
+
+A memory pool will be maintained in the memory optimization stage. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be taken to handle its input/output variables.
+
+```
+if op.support_inplace():
+    i --> pool
+    pool --> o
+else:
+    pool --> o
+    i --> pool
+```
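+
+A toy Python simulation of this policy may help; the pool is just a list of reusable blocks, and `support_inplace` decides the order in which the dead input is released and the output is allocated (the names are illustrative, not the transpiler's real data structures):
+
+```python
+class MemoryPool(object):
+    def __init__(self):
+        self.free_blocks = []  # variables whose memory can be reused
+
+    def release(self, var):  # "i --> pool"
+        self.free_blocks.append(var)
+
+    def acquire(self, var):  # "pool --> o"
+        if self.free_blocks:
+            print("%s reuses the memory of %s" % (var, self.free_blocks.pop()))
+        else:
+            print("%s gets freshly allocated memory" % var)
+
+def run_op(pool, dead_input, output, support_inplace):
+    if support_inplace:
+        pool.release(dead_input)  # the input is freed first, so the output
+        pool.acquire(output)      # can grab the very same block
+    else:
+        pool.acquire(output)      # the output must not alias the input,
+        pool.release(dead_input)  # so the input is freed afterwards
+
+pool = MemoryPool()
+run_op(pool, dead_input="a", output="e", support_inplace=True)   # e reuses a
+run_op(pool, dead_input="d", output="g", support_inplace=False)  # g gets fresh memory
+```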
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/design/mkl/image/engine.png b/doc/design/mkl/image/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b Binary files /dev/null and b/doc/design/mkl/image/engine.png differ diff --git a/doc/design/mkl/image/gradients.png b/doc/design/mkl/image/gradients.png new file mode 100644 index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c Binary files /dev/null and b/doc/design/mkl/image/gradients.png differ diff --git a/doc/design/mkl/image/layers.png b/doc/design/mkl/image/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a Binary files /dev/null and b/doc/design/mkl/image/layers.png differ diff --git a/doc/design/mkl/image/matrix.png b/doc/design/mkl/image/matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2 Binary files /dev/null and b/doc/design/mkl/image/matrix.png differ diff --git a/doc/design/mkl/image/overview.png b/doc/design/mkl/image/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 Binary files /dev/null and b/doc/design/mkl/image/overview.png differ diff --git a/doc/design/mkl/mkl_packed.md b/doc/design/mkl/mkl_packed.md new file mode 100644 index 0000000000000000000000000000000000000000..0123315ad4368e68b377f66119949bfd6c1c7860 --- /dev/null +++ b/doc/design/mkl/mkl_packed.md @@ -0,0 +1,108 @@
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points)
+  - [Background](#background)
+  - [Solution](#solution)
+- [Actions](#actions)
+  - [CMake](#cmake)
+  - [Layers](#layers)
+  - [Unit Tests](#unit-tests)
+  - [Python API](#python-api)
+  - [Benchmarking](#benchmarking)
+
+
+## Overview
+We plan to integrate the GEMM Packed APIs\[[1](#references)\] introduced by Intel® MKL into PaddlePaddle, to take full advantage of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.
+The current optimization mainly targets the Recurrent Neural Network (RNN) related layers (`RecurrentLayer`, `GatedRecurrentLayer` and `LstmLayer`) and the PaddlePaddle V1 API.
+
+## Key Points
+
+### Background
+PaddlePaddle currently uses the [cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm) function of the Intel® MKL library, which itself converts the original data into an internal format better suited to Intel platforms before computing.
+
+1. Conversion overhead \
+This data format conversion (packing) is relatively time-consuming when the computation of the problem itself is small. For example, in the Vanilla RNN part of DeepSpeech2 \[[2](#references)\], the matrix size is `batch_size * 2048`.
+2. Redundant conversion \
+In some existing cases (e.g. RNN), multiple calls to cblas_?gemm use the same original data, so packing the same data again on every call is redundant.
+
+To minimize the packing overhead of repeated cblas_?gemm calls, Intel® MKL introduces the following four APIs:
+  * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+  * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+  * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+  * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+With these APIs, we can pack the original data once and then pass the already packed data to the gemm_compute functions that reuse the same data, thereby avoiding redundant packing.
+
+### Solution
+In the RNN case, all time steps within one forward/backward pass share the same weight. When doing inference only, successive forward passes also use the same weight, so there is no need to repack the weight at every time step of every forward pass.
+
+Using the newly introduced GEMM Packed APIs, we pack the weight once at layer initialization, reuse the packed weight in the forward and backward passes, and convert the new weight after each weight update for the next iteration.
+
+* Before the optimization, for a model with sequence length `T`, the number of conversions performed in `N` iterations is:
+  - `inference`: `N * T`
+  - `training`: `2 * N * T`
+* After the optimization, for the same model, the number of conversions is reduced to:
+  - `inference`: `1`
+  - `training`: `2 * N`
+
+## Actions
+
+The added files and directory structure are as follows:
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        │   ├── MKLPackedGatedRecurrentLayer.*
+        │   ├── MKLPackedLstmLayer.*
+        │   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+In the corresponding `CMakeLists.txt`, whether the MKL Packed features are enabled depends on whether `WITH_MKL` is turned on.
+
+### Layers
+All `MKLPacked*Layer`s inherit from PaddlePaddle's base class `Layer`. We also add the header file `MKLPackedGemm.h`, which wraps the related GEMM Packed APIs.
+
+### Unit Tests
+We will add `test_MKLPacked.cpp` to test the layers optimized with MKL Packed.
+For every newly added RNN layer, we compare two aspects:
+1. The results of the optimized layer itself in sequence mode (`rnn_use_batch=false`) versus batch mode (`rnn_use_batch=true`).
+2. The results of the optimized layer versus the corresponding original PaddlePaddle layer, in batch mode.
+
+### Python API
+We plan to add a `use_mkl_packed` flag in `paddle/utils.Flags` to choose whether to use the related features; when compiled with `WITH_MKL=ON`, it defaults to `true`.
+
+Meanwhile, a `use_mkl_packed` option will be added at the corresponding layers in `python/paddle/trainer/config_parser.py`, so that users can choose from the Python side whether to enable this feature.
+
+For example:
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+All related `layer_type`s will start with *mkl_packed_*; this is guaranteed when the `MKLPacked*Layer`s register their layers, so as to distinguish them.
+
+
+### Benchmarking
+Corresponding scripts will be added to test and compare the network performance before and after using the MKL Packed recurrent layers.
+
+## References
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
diff --git a/doc/design/mkl/mkldnn.md b/doc/design/mkl/mkldnn.md new file mode 100644 index 0000000000000000000000000000000000000000..e2fe1e6b26ffa73fda81863abfadf697c0acbfcf --- /dev/null +++ b/doc/design/mkl/mkldnn.md @@ -0,0 +1,210 @@
+# Intel® MKL-DNN on PaddlePaddle: Design Doc
+
+We plan to integrate the Intel Math Kernel Library for Deep Neural Networks ([Intel MKL-DNN](https://github.com/01org/mkl-dnn))
+into PaddlePaddle, to fully exploit the advantages of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.
+
+Figure 1. PaddlePaddle on IA
+
+Near-term goals:
+
+- Complete the MKL-DNN implementations of the commonly used layers.
+- Complete the MKL-DNN implementations of the common deep neural networks VGG, GoogLeNet and ResNet.
+
+The current optimization mainly targets PaddlePaddle's pre-refactor code framework and the V1 API.
+The detailed completion status can be found [here](https://github.com/PaddlePaddle/Paddle/projects/21).
+
+## Contents
+
+- [Overview](#overview)
+- [Actions](#actions)
+  - [CMake](#cmake)
+  - [Matrix](#matrix)
+  - [Layers](#layers)
+  - [Activations](#activations)
+  - [Parameters](#parameters)
+  - [Gradients](#gradients)
+  - [Unit Tests](#unit-tests)
+  - [Python API](#python-api)
+  - [Benchmarking](#benchmarking)
+  - [Others](#others)
+- [Design Concerns](#design-concerns)
+
+## Overview
+
+We will integrate MKL-DNN into PaddlePaddle as a third-party library; like the other third-party libraries, it will be downloaded and built when PaddlePaddle is compiled.
+
+Meanwhile, to further improve PaddlePaddle's speed on basic math operations, we will also integrate MKLML (the MKL small library\[[1](#references)\]) as another third-party library, which only includes prebuilt dynamic libraries and header files.
+
+The relationship among MKL, MKLML and MKL-DNN is shown in the following table:
+
+| Name        | Open Source      | License     | Descriptions  |
+| :---------- | :--------------- | :---------- | :------------ |
+| MKL         | No               | Proprietary | Accelerate math processing routines |
+| MKLML       | No               | Proprietary | Small package of MKL, especially for Machine Learning |
+| MKL-DNN     | Yes              | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks |
+
+MKLML can be used together with MKL-DNN to achieve the best performance.
+
+Figure 2. PaddlePaddle with MKL Engines
+
+## Actions
+
+The added files and directory structure are as follows:
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
+
+### CMake
+`CMakeLists.txt` provides a master switch related to MKL: `WITH_MKL`, which decides whether MKLML and MKL-DNN are used at build time.
+
+- `WITH_MKLML` controls whether the MKLML library is used.
+When `WITH_MKL` is turned on, MKLML is automatically used as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve MKLML's performance.
+At build time the corresponding headers and libraries are placed under `build/third_party/install/mklml/*`.
+The MKLML libraries are currently all dynamic libraries, mainly `libiomp5.so` and `libmklml_intel.so`.
+- `WITH_MKLDNN` controls whether MKL-DNN is used.
+When `WITH_MKL` is turned on, whether to build MKL-DNN is decided automatically according to the hardware configuration [[2](#references)].
+At build time the corresponding headers and libraries are placed under `build/third_party/install/mkldnn/*`.
+The MKL-DNN library currently only provides the dynamic library `libmkldnn.so`.
+
+### Matrix
+Currently, data in PaddlePaddle is always stored in the `NCHW` format, but MKL-DNN uses more than this one arrangement.
+So we define an `MKLDNNMatrix` to manage the different MKL-DNN data formats and the conversions between them.
+
+Figure 3. MKLDNNMatrix
+
+### Layers
+All MKL-DNN layers inherit from `MKLDNNLayer`, which in turn inherits from PaddlePaddle's base class `Layer`.
+`MKLDNNLayer` provides the necessary interfaces and functions and implements the basic `forward` and `backward` logic;
+subclasses only need to use the predefined interfaces and implement the concrete functionality.
+
+Figure 4. MKLDNNLayer
+
+Each MKLDNNLayer contains a series of MKLDNNMatrix objects for internal and external storage:
+
+- Internal memory: `inVal_`, `inGrad_`, `outVal_` and `outGrad_`, representing the input value, input gradient, output value and output gradient respectively.
+- External memory: all prefixed with ext, such as `extInVal_` and `extInGrad_`; they are mainly used to convert memory when the data format does not match PaddlePaddle's default `NCHW` format.
+Note that PaddlePaddle's activations directly use `output_.value` and `output_.grad`,
+so `extOutVal_` and `extOutGrad_` must share memory with `output_.value` and `output_.grad` respectively;
+if no external storage is needed for conversion, the corresponding internal storage also shares memory with them.
+- Conversion functions (resetXXX): `resetInValue`, `resetInGrad`, `resetOutValue` and `resetOutGrad`,
+which convert the input value, input gradient, output value and output gradient respectively.
+These functions reset the internal and external storage according to the input arguments; the two can also be equal, which means no conversion is needed.
+
+Note: each subclass of `MKLDNNLayer` only needs to use the internal memory; all the external conversion work is prepared in the reset family of functions.
+
+### Activations
+In PaddlePaddle before the refactor, activation functions are a concept independent of `Layer`, and their input and output share the same memory,
+so we add a corresponding `MKLDNNActivation`, implemented in a way similar to `MKLDNNLayer`.
+
+### Parameters
+For layers with parameters, we ensure that the parameters used by `MKLDNNLayer` share memory with the buffers allocated by PaddlePaddle.
+If the data arrangement differs, we convert the format to the one MKL-DNN expects before the network is trained,
+and save it back in PaddlePaddle's format when training finishes; no conversion is needed during the whole training process.
+This keeps the finally saved parameter format consistent with PaddlePaddle while avoiding unnecessary conversions.
+
+### Gradients
+MKL-DNN operations write their outputs by overwriting, i.e. results are not accumulated onto the original data,
+which has the benefit that memory does not need to be cleared all the time, saving unnecessary operations.
+Note, however, that when the network has branches, the gradients coming from different layers need to be accumulated during `backward`.
+So a merge method is implemented in `MKLDNNLayer`: the `Input Gradient` of each small branch
+is first stored temporarily in an `MKLDNNMatrix`, the layer at the branching point is responsible for the summation, and the result is put into the current layer's `output_.grad`.
+Overall, subclasses therefore do not need to care about branching when they are implemented.
+
+Figure 5. Merge Gradients
+
+### Unit Tests
+We will add `test_MKLDNN.cpp` and `MKLDNNTester.*` for MKL-DNN testing.
+The tests consist of unit tests for each layer (or activation) and overall tests on simple networks.
+Each test compares the CPU results computed by PaddlePaddle with the MKL-DNN results; a difference below a small threshold counts as a pass.
+
+### Python API
+Only the **v1 API** is considered for now.
+
+We plan to add a `use_mkldnn` option in `python/paddle/trainer/config_parser.py`, so that users can conveniently choose to use the MKL-DNN layers.
+
+For example:
+
+```python
+use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+if use_mkldnn:
+    self.layer_type = mkldnn_*
+```
+
+All MKL-DNN `layer_type`s will start with *mkldnn_*; this is guaranteed when the `MKLDNN*Layer`s register their layers, so as to distinguish them.
+
+Meanwhile, a `use_mkldnn` flag will be added in `paddle/utils.Flags` to choose whether to use the MKL-DNN related features.
+
+### Benchmarking
+Corresponding scripts will be added [here](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image) to test and compare the CNN performance before and after using MKL-DNN.
+The performance comparison results will be put in [IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md).
+
+### Others
+1. When MKL-DNN is used, the CPU buffers are aligned to 4096 bytes; see [memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673) in MKL-DNN for details.
+2. Dig deeper into PaddlePaddle to look for other optimization opportunities, e.g. OpenMP might be used to improve the update performance of SGD.
+
+## Design Concerns
+
+To better conform to PaddlePaddle's coding style\[[3](#references)\] while sacrificing as little MKL-DNN performance as possible\[[4](#references)\],
+we summarize some points that need special attention:
+
+1. Use **deviceId_**. To add as few variables or functions as possible to the parent class Layer,
+we decided to reuse the existing `deviceId_` variable to distinguish the layer's property, defining `-2` as the device ID specific to `MKLDNNLayer`.
+2. Override the parent Layer's **init** function and change `deviceId_` to `-2`, indicating that this layer runs in the MKL-DNN environment.
+3. Create `MKLDNNBase` to define the classes and functions other than those related to layers and memory,
+including `MKLDNNStream` and `CPUEngine` used by MKL-DNN, and possibly `FPGAEngine` and others in the future.
+4. If an MKL-DNN layer is followed by a cpu device, `output_.value` shares memory with `extOutVal_`
+and the data format is `NCHW`, so that the next cpu device gets the correct data.
+When ordinary CPU layers are present, the formats of `extOutVal_` and `extOutGrad_` are always `NCHW` or `NC`.
+
+## References
+1. The [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application) is a subset of [Intel MKL](https://software.intel.com/en-us/mkl).
+It mainly includes the math primitives and operations related to deep learning, and is generally updated together with MKL-DNN when a [new version](https://github.com/01org/mkl-dnn/releases) is released.
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements).
+Currently, PaddlePaddle uses MKL-DNN only on machines that support the AVX2 instruction set or above.
+3. [The previous proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) would introduce information about the **nextLayer**.
+But in PaddlePaddle, neither the pre-refactor layers nor the post-refactor ops want to know about the next layer/op.
+4. MKL-DNN's high-performance formats differ from PaddlePaddle's original `NCHW` (the cuDNN part of PaddlePaddle also uses `NCHW`, so this problem does not exist there).
+So a conversion method needs to be introduced, and the format should only be converted when necessary, to get the best performance out of MKL-DNN.
diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..bef126f3f0577b69f646dfe5d10539b372c6a8a5 --- /dev/null +++ b/doc/design/mkl/mkldnn_fluid.md @@ -0,0 +1,149 @@
+# Design Doc: Add MKLDNN Kernel in Fluid Operator
+
+## Principles
+
+First of all, we should follow some basic principles:
+1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel to operators, so basically we should follow this doc.
+2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library for fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).
+3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
Another important point is that we should ensure data synchronization between different kernel types, which is discussed in this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override the `GetExpectedKernelType` and `trans` functions to support switching kernels.
+4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal concept which records the `Place`, `Library`, `DataType` and `Layout`.
+
+## Solution
+
+In general, there are four steps we should follow to run an MKL-DNN primitive:
+- Create a primitive descriptor that describes this operator
+- Create the primitive itself from the primitive descriptor and the engine
+- Create all the memory buffers that the primitive needs
+- Launch a stream to execute the created primitive
+More details can be found [here](http://01org.github.io/mkl-dnn).
+
+It's better to avoid reinitializing the primitives and memory handles of the first three stages in every iteration. \
+So we plan to create a map to record all the `primitive` and `memory` objects, which should not take too much memory as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).
+
+It's assumed that the following three conditions are satisfied:
+1. There is a unique key for each operator instance, which may be the actual name of the `Output Tensor`.
+2. The `Input Tensor` inside the `Compute` function is the one after conversion.
+3. We can get the phase (e.g. `is_test`) inside the `Compute` function; otherwise we need to expose this attribute to users.
+
+### Compute
+The algorithm of `Compute` is described as follows; let's take conv as an example.
+
+```c++
+
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
+  PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");
+
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+
+  // find the primitive by its unique key in the mkldnn context;
+  // op_key should be a unique name of this op instance
+  auto& p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  // assuming the input tensor inside this compute function is the one after conversion;
+  // this point should be guaranteed by another mechanism
+  auto& i = dev_ctx.findMemory(op_key + "_input");
+
+  if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) {
+    auto fwd_primitive_desc = createPrimitiveDesc(ctx);
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+    shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
+    shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
+    shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
+    shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));
+
+    dev_ctx.addMemory(op_key+"_input", in);
+    dev_ctx.addMemory(op_key+"_output", out);
+    dev_ctx.addMemory(op_key+"_filter", wgt);
+    dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive);
+    dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  PADDLE_ENFORCE(p, "Should have forward Primitive");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_input"), "Should have input memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_output"), "Should have output memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_filter"), "Should have filter memory");
+  PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_key+"_fwd_PD"), "Should have forward PrimitiveDesc");
+  dev_ctx.submit(p);
+  dev_ctx.execute();  // the convert primitive should already be contained.
+
+```
+
+`createPrimitiveDesc` returns the primitive descriptor of this operator and would look like this:
+```c++
+  auto* input = ctx.Input<Tensor>("Input");
+  auto* filter = ctx.Input<Tensor>("Filter");
+  auto* output = ctx.Output<Tensor>("Output");
+  std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+  int groups = ctx.Attr<int>("groups");
+  algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
+  prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;
+
+  auto fwd_desc = mkldnn::conv_fwd::desc(/* all the settings above */);
+  shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));
+
+  return fwd_primitive_desc;
+  }
+```
+
+### MKLDNNDeviceContext
+`MKLDNNDeviceContext`, which is very straightforward, should contain some base information such as the `stream`, the `engine` and the map mentioned above.
+
+
+### mkldnn_helper
+Some functions would be put in `paddle/platform/mkldnn_helper.h`:
+- create MKLDNN memories
+- create MKLDNN primitives
+- error checking functions
+- etc.
+
+
+### Kernel Switch
+We should `reorder` data whose layout comes from, or goes to, another device. The `GetExpectedKernelType` and `trans` functions can help us implement this.
+
+`GetExpectedKernelType` gets the context, so that the operator can return the best `KernelType`.
+`trans` would look like this:
+
+```c++
+void trans(inputs, ctx) override {
+  if (NoNeedTrans()) {
+    return;
+  }
+  // find the reorder primitive by op_key from the context
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+  auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  auto& i = dev_ctx.findMemory(op_key + "_src_input");
+
+  if (p == nullptr || i == nullptr || changeSized(i, input)) {
+    auto prim = createPrimitiveDesc(ctx);
+    auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
+    auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
+    auto dst = createMemory(prim->expected_desc(), newbuffer->data);
+    auto reorder_primitive(new mkldnn::reorder(src, dst));
+
+    dev_ctx.addMemory(op_key+"_src_input", src);
+    dev_ctx.addMemory(op_key+"_input", dst);
+    dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  PADDLE_ENFORCE(p, "Should have Reorder Primitive");
+  dev_ctx.submit(p);
+  if (!this->isMKLDNNKernel()) {
+    // execute immediately only if this is not an mkldnn kernel function;
+    // otherwise, it can be executed with the operator primitive in Compute
+    dev_ctx.stream();
+  }
+  // after submit, the input tensor in the ExecutionContext should be changed to the converted one;
+  // there should be another mechanism to ensure this
+}
+```
+
+### Unit Test
+All the functions should have corresponding unit tests.
+TBD diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD deleted file mode 100644 index ec6d4681836e189f46dbb9b915a237dc15cda7cf..0000000000000000000000000000000000000000 --- a/doc/design/mkldnn/README.MD +++ /dev/null @@ -1,120 +0,0 @@ -# Intel® MKL-DNN on PaddlePaddle: Design Doc - -我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 - -我们短期内的基本目标是: - -- 完成常用layer的MKL-DNN实现。 -- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 - - -## Contents - -- [Overview](#overview) -- [Actions](#actions) - - [CMake](#cmake) - - [Layers](#layers) - - [Activations](#activations) - - [Weights](#weights) - - [Unit Tests](#unit-tests) - - [Protobuf Messages](#protobuf-messages) - - [Python API](#python-api) - - [Demos](#demos) - - [Benchmarking](#benchmarking) - - [Others](#others) -- [Design Concerns](#design-concerns) - -## Overview - -我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 -
-
-Figure 1. PaddlePaddle on IA. -
- -## Actions -我们把集成方案大致分为了如下几个方面。 - -### CMake -我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。 - -当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。 - -当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭。 - -所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 - -### Layers -所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 -`paddle/gserver/layers`中,并且文件名都会一以*MKLDNN*开头。 - -所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 - -在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数,子类只需要实现这些函数即可。 - -### Activations -由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。 - -### Weights -由于有些layer是含有参数的,我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。 -同时,由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致,我们会在网络训练的开始和结束时分别转换这个layout,使得最终保存的参数格式与PaddlePaddle一致。 - -### Unit Tests -会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 -测试分为每个layer(或activation)的单元测试和简单网络的整体测试。 -每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 - -### Protobuf Messages -根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 - -### Python API -目前只考虑**v1 API**。 - -计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 - -具体实现方式比如: - -```python -use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) -if use_mkldnn - self.layer_type = mkldnn_* -``` - -所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 - -并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。 - -### Demos - -会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 - -### Benchmarking -会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试使用MKL-DNN之后的性能。 - -### Others -1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 -2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 - -## Design Concerns - -为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。 - -我们总结出一些特别需要注意的点: - -1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 -2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 -3. 创建`MKLDNNMatrix`,同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 -4. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 -5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表input value, input gradient,output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory),主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好,每个子类只需要调用定义好的reset buffer函数即可。 -6. 每个`MKLDNNlayer`的resetbuffer相关的函数(包括reset input、output的Value和grad),他们会根据输入参数reset internal和external的memory,当然这两者也可以相等,即表示不需要转换。只需要把握一个原则,每个`MKLDNNlayer`的子类,只需要使用internal的memory就可以了,所有external的转换工作在父类的reset函数中都提前准备好了。 -7. 一般来说,external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存,因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换,那么internal的buffer也会与他们共享内存。 -8. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存,同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`。 -9. 
由于MKL-DNN的输出操作都是覆盖data的,不是在原来的数据上累加,所以当网络出现分支时,在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法,此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中,由分支处的layer负责求和,并把结果放到这个layer的`output_.grad`中。所以整体上,每个子类并不会需要关心分支的事情,也是在父类都实现好了。 -10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 - -## References - -1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") -2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 -3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 - diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png deleted file mode 100644 index 84b455c28230703599a2529f014cfbb222138fef..0000000000000000000000000000000000000000 Binary files a/doc/design/mkldnn/image/overview.png and /dev/null differ diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md new file mode 100644 index 0000000000000000000000000000000000000000..f86e6b7a564ed23f2bddbec25da1c110014f941d --- /dev/null +++ b/doc/design/operator_kernel_type.md @@ -0,0 +1,91 @@ +# Design Doc: The Keys of Operator Kernel Type +## Problem +An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: + +```cpp +struct OpKernelType { + platform::Place place_; + proto::DataType data_type_; +}; +``` +For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) on GitHub. + +It contains two keys, `Place` and `DataType`. These two keys will be hashed to a unique key that represents a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`. + +We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices. + +For example, the Eigen library supports Nvidia GPU/AMD GPU/CPU, while the MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be keys of `OpKernelType`. + +Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layouts of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration. + +## Solution + +There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`. + +```cpp +struct OpKernelType { + platform::Place place_; + platform::Library library_; + proto::DataType data_type_; + framework::Layout layout_; +}; +``` + +The details are as follows: + +### Place + +`Place` is defined as: + +```cpp +typedef boost::variant<CUDAPlace, CPUPlace> Place; +``` + +`Place` represents the device memory where data is located.
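To make the four-key lookup concrete, here is a minimal Python sketch of kernel selection via `OpKernelType`. It is an illustration only; the type names and string values are hypothetical stand-ins, not Fluid's actual registry code.

```python
from collections import namedtuple

# Hypothetical stand-in for the C++ OpKernelType; all four fields form the key.
OpKernelType = namedtuple("OpKernelType", ["place", "library", "data_type", "layout"])

# Each operator keeps a map from kernel type to kernel implementation.
conv2d_kernels = {
    OpKernelType("CPUPlace", "MKLDNN", "fp32", "nChw8c"): lambda ins: "mkldnn conv2d",
    OpKernelType("CUDAPlace", "CUDNN", "fp32", "NCHW"): lambda ins: "cudnn conv2d",
}

def run_conv2d(kernel_type, inputs):
    # A namedtuple hashes on all of its fields, so two kernel types that
    # differ in any one key select different kernels.
    kernel = conv2d_kernels.get(kernel_type)
    if kernel is None:
        raise KeyError("no conv2d kernel registered for %s" % (kernel_type,))
    return kernel(inputs)

print(run_conv2d(OpKernelType("CUDAPlace", "CUDNN", "fp32", "NCHW"), []))
```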
+ + +### Library + +One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable: + +```cpp +enum Library { Plain, MKLDNN, CUDNN }; +``` + +We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take the `Eigen` library as the `Plain` enumerator. +A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle. + +If we want to support a new library, a new enumerator needs to be added to `Library`, and a corresponding new `LibraryDeviceContext` needs to be created. + + +### DataType + + +`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported. + +### Layout + +Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape (ddim), stride, and layout. + +Different layouts lead to different implementations of the operator kernel. There are mainly four principles we have to follow to support layout in the Fluid framework. + +- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, then the memory formats in MKLDNN will also be added into this enum variable. + +- Users have to set the layout for input data. And some operators, like fill_constant/random, also have to set the layout for the data they generate. Of course, we can have some default layout, like NCHW. + +- The inference of Layout is at run-time, not at compile-time. + +- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for the different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators. + +`Layout` is also defined as an enum variable: + +```cpp +enum Layout { + kNCHW, + kNHWC, +#ifdef PADDLE_WITH_MKLDNN + knChw8c + ... +#endif +}; +``` diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md index 202b4b65103c0b7c536a9cb466c4120ce134d8c3..691081c268b848811bf5ee6d6a41edfe0f47eec0 100644 --- a/doc/design/optimizer.md +++ b/doc/design/optimizer.md @@ -79,7 +79,7 @@ class Optimizer(object): def minimize(self, loss, parameter_list): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ params_grads = self.create_backward_pass(loss, parameter_list) diff --git a/doc/design/paddle_nccl.md b/doc/design/paddle_nccl.md new file mode 100644 index 0000000000000000000000000000000000000000..c7dac70998a6cfec3a6d2fc72b698ff9722e6805 --- /dev/null +++ b/doc/design/paddle_nccl.md @@ -0,0 +1,65 @@ +# Design Doc: NCCL support in Paddle Fluid + +## Abstract + +This Design Doc describes the NCCL feature in Paddle. We propose an approach to support the NCCL library both on a single machine and on multiple machines.
We wrap the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize multi-GPU power in one script. + + +## Motivation + +[NCCL](https://developer.nvidia.com/nccl) is an NVIDIA library that supports multi-GPU communication and is optimized for NVIDIA GPUs. It provides routines such as all-gather, all-reduce, broadcast, reduce, and reduce-scatter that can achieve high bandwidth over PCIe and the NVLink high-speed interconnect. With the NCCL library, we can easily accelerate training in parallel. + +- Pros +1. easy to plug in the [NCCL2](https://developer.nvidia.com/nccl) library. +1. high performance on NVIDIA GPUs. +1. MPI-like primitives, which have a low learning cost for users. + +- Cons +1. Designed only for NVIDIA GPUs, not a general multi-device solution. +1. Although NCCL1 is open-sourced under the BSD license, NCCL2 is not open-sourced anymore. + +At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients whenever the user requires. + +As a result, during training, we need the operations of peer-to-peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only needs to run the operator with the correct place information. + +Besides, it needs interfaces to synchronize the model update between the different GPU cards. + +## Implementation + +As mentioned above, we wrap the NCCL routines as several kinds of operators. Note that NCCL needs to create a Communicator between the GPUs at the beginning, so a NCCLInit operator is created. + +### Transpiler + +To be compatible with the [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user-defined operation graph into sub-graphs to be executed on different devices. + +1. The user-defined model will be a single device program + +2. Broadcast/Reduce operators between GPUs will be inserted into the program; for the multi-node case, the `Send` and `Recv` operators may also be inserted. + + *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines* + + + +After compiling, the graph is as shown below. + + + +Operators are added to the sub-graphs. Every GPU is assigned a role of `rank0`, `rank1` etc. + +- **Broadcast**. The Broadcast operator distributes an initialized parameter to all the GPUs from the GPU that owns it, e.g. from the `rank0` GPU. +- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with the ring-based communication method, avoiding the bottleneck of a single GPU. + +Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process runs in asynchronous or synchronous mode depends on the AllReduce points in the graph. + +As shown in the picture, when each GPU computes the gradient of `W`, followed by an `AllReduce` operator, it accumulates `dW` over the full batch of data, then runs the optimization process individually and applies the gradient to its `W`. + +- **AllReduce** + Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive, every GPU would optimize over the full batch of data, wasting (n-1) GPUs' compute resources. In addition, the NCCL2 built-in AllReduce only utilizes the communication resource during synchronization, so updating the gradient would be a subsequent phase.
In fact, we can amortize the gradient update time cost into the communication phase. The process is: +1. Every parameter has its root card. That card is responsible for aggregating the gradients from the GPUs. +2. The whole model's parameters are hashed to different root cards to ensure load balance between the GPUs. +3. Logically neighboring cards start sending the parameter to the next one. After one round, the parameter's main card will have aggregated the full gradients. +4. Then the root card will optimize the parameter. +5. This parameter card then sends its optimized result to its neighbor, and the neighbor sends the parameter on to its next one. +6. Finish the synchronization round. + +The total time cost will be 2 * (n-1) * per-parameter-send-time, so we reach the goal of amortizing the update time into the communication phase. diff --git a/doc/design/profiler.md b/doc/design/profiler.md new file mode 100644 index 0000000000000000000000000000000000000000..b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20 --- /dev/null +++ b/doc/design/profiler.md @@ -0,0 +1,97 @@ +## Introduction + +There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages in order to run on both CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) that uses [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python parts of a program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks. `nvprof` collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory sets and CUDA API calls, as well as events or metrics for CUDA kernels, while `yep` and `Google's perftools` can't collect the timeline of a CUDA program. None of these tools can collect time at the operator level. So we design this profiling tool. + +## Architecture + +The work flow for most tasks is as follows. Each operator will run many times across all the iterations, so the profiler must collect the total time of each operator during the iterations. Moreover, the developers may sometimes want to collect more detailed time spans inside an operator, or record time spans elsewhere, which requires the profiler to support recording nested time spans. And in order to speed up training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline for each thread. In addition, the profiler itself occupies certain resources, so it must be easy for the developers to enable or disable it. At last, the profiler should present a human-readable report.
+ +```python +for i in xrange(M): # M is the iteration number + for op in operator_lists: # The `operator_lists` contains all the operators in the network. + op.run(); +``` + +In summary, the profiler should have the following features: + +- records time spans in a loop. +- supports nested time spans. +- supports multiple threads/multiple GPUs. +- supports being enabled and disabled by users. + +But how to record the time for the mixed C++ and CUDA program? There are many C++ APIs to get the current calendar time in the host program. But for the GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and the CUDA kernels are asynchronous with the host program if there is no synchronization after the CUDA kernels. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline. Then we summarize and present statistics based on these events. + +The overall flow is shown as the following figure. +
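Before walking through the event design below, here is a toy Python sketch of the required behavior: nested time spans that can be globally disabled. The names (`record_event`, `PROFILER_ENABLED`) are hypothetical illustrations, not the actual profiler API.

```python
import time

PROFILER_ENABLED = True  # global on/off switch, analogous to the profiler state
_events = []             # collected (name, push/pop, timestamp) records

class record_event(object):
    """Toy sketch: records a push/pop event pair around a code region."""

    def __init__(self, name):
        self.name = name

    def __enter__(self):
        if PROFILER_ENABLED:
            _events.append((self.name, "push", time.time()))
        return self

    def __exit__(self, *exc_info):
        if PROFILER_ENABLED:
            _events.append((self.name, "pop", time.time()))

with record_event("iteration"):      # outer time span
    with record_event("conv_op"):    # nested time span, as required above
        time.sleep(0.01)             # stand-in for real work

print(_events)  # four events: push/push/pop/pop
```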
+ +### Event + +In the above work flow, a pair of events is needed before and after the piece of code whose time is to be collected. So the event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify the profiling start or end. So there are three kinds of events: + +```c++ +enum EventKind { + kMark, + kPushRange, + kPopRange}; +``` +- kMark: only a marker, without a time range. +- kPushRange: marks the starting event of a time range. +- kPopRange: marks the ending event of a time range. + +For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, event lists are used to record each piece. + +```c++ +class Event { + public: + // The DeviceContext is used to get current CUDA stream. + Event(EventKind kind, std::string name, uint32_t thread_id, + const platform::DeviceContext* dev_ctx = nullptr); + double CpuElapsedUs(const Event& e) const; + double CudaElapsedUs(const Event& e) const; + + private: + EventKind kind_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +}; + +struct EventList { + std::forward_list<std::vector<Event>> event_blocks; +}; +``` + +As mentioned above, there is no need to record the timeline when the profiler is disabled. So there is a global state to enable or disable the profiler. + +```c++ +enum ProfilerState { + kDisabled, + kCPU, + kCUDA +}; +ProfilerState g_state; +``` +- kDisabled: the disabled state. +- kCPU: CPU profiling state. +- kCUDA: GPU profiling state. + +A pair of starting and ending events are pushed to the event lists in the constructor and destructor of `RecordEvent`. So the timeline is recorded for the code within the lifecycle of an object of `RecordEvent`. + +```c++ +struct RecordEvent { + explicit RecordEvent(const std::string name, + platform::DeviceContext* dev_ctx = nullptr) { + if (g_state == ProfilerState::kDisabled) return; + // push the starting event to the event lists. + } + ~RecordEvent() { + if (g_state == ProfilerState::kDisabled) return; + // push the ending event to the event lists. + } +}; +``` diff --git a/doc/design/python_api.md b/doc/design/python_api.md index cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c..73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b 100644 --- a/doc/design/python_api.md +++ b/doc/design/python_api.md @@ -279,6 +279,26 @@ class LayerHelper(object): return tmp ``` +### Return value of layer functions + +A layer will return a Variable, which is also the output of an operator. However, the outputs of a layer function have more attributes than an operator's. There are parameter variables, and their gradient variables also need to be returned. Returning them is useful. For example, + +1. Users can debug the network by printing parameter gradients. +2. Users can append attributes to a parameter; for example, `param.stop_gradient=True` will make a parameter stop generating its gradient. We can fix the parameter value during training by using this attribute. + +However, it is still good for a layer to return a Variable, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field to the layer function's return value, since Python is dynamically typed. + +Sample usage is + +```python +data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...) +... + +executor.run(fetch_list=[hidden.param, hidden.param.grad], ...) +``` + + ## Optimizer [Optimizer Design Doc](./optimizer.md) diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 320dccec3ddc7bfe6042f4e65b2518ea7b1ad24a..2cd4b6225b61cf374458e40afabad7745f61ba71 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -1,25 +1,25 @@ # Python Data Reader Design Doc -At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that reads input data, we define the following: -- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. -- A *reader creator* is a function that returns a reader function. -- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. -- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. +and also provide a function which can convert a reader to a batch reader, along with frequently used reader creators and reader decorators. ## Data Reader Interface -Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`): +A *data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything that can be used in `for x in iterable`) as follows: ``` iterable = data_reader() ``` -Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int) +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Each item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int, etc.)
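The diff context below elides the body of `reader_creator_random_image`; a complete, runnable sketch of such a single-item reader creator (using numpy, with the same function name as in the snippet) might look like this:

```python
import numpy as np

def reader_creator_random_image(width, height):
    # The creator returns a reader; the reader yields single entries,
    # never mini batches.
    def reader():
        while True:
            yield np.random.uniform(-1, 1, size=width * height).astype("float32")
    return reader

# Each call to the creator produces a fresh reader (an iterable of entries).
reader = reader_creator_random_image(20, 20)
first_item = next(reader())
print(first_item.shape)  # (400,)
```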
-An example implementation for single item data reader creator: +An example implementation for a single item data reader creator is as follows: ```python def reader_creator_random_image(width, height): @@ -29,7 +29,7 @@ return reader ``` -An example implementation for multiple item data reader creator: +An example implementation for a multiple item data reader creator is as follows: ```python def reader_creator_random_image_and_label(width, height, label): def reader(): @@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label): ## Batch Reader Interface -*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. +A *batch reader* can be any function without any parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: -Here are valid outputs: ```python # a mini batch of three data items. Each data item consist three columns of data, each of which is 1. [(1, 1, 1), (2, 2, 2), (3, 3, 3)] # a mini batch of three data items, each data item is a list (single column). [([1,1,1],), ([2,2,2],), ([3,3,3],),] ``` Please note that each item inside the list must be a tuple, below is an invalid output: ```python # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three column of datas, each of which is 1. + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. [[1,1,1], [2,2,2], [3,3,3]] ``` -It's easy to convert from reader to batch reader: +It is easy to convert from a reader to a batch reader: + ```python mnist_train = paddle.dataset.mnist.train() mnist_train_batch_reader = paddle.batch(mnist_train, 128) ``` -Also easy to create custom batch reader: +It is also straightforward to create a custom batch reader: + ```python def custom_batch_reader(): while True: @@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader ## Usage -batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: ```python # two data layer is created: @@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ## Data Reader Decorator -*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax. +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. -Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples: +Since we have a strict interface for data readers (no parameters, returning a single data item), a data reader can be used in a flexible way using data reader decorators.
Following are a few examples: + +### Prefetch Data + -Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data. +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. Use `paddle.reader.buffered` to prefetch data: @@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ### Compose Multiple Data Readers -For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +For example, if we want to use a source of real images (say reusing the mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). -We can do: +We can do the following: ```python def reader_creator_random_image(width, height): @@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False) reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care second item at this time. +# And we don't care about the second item at this time. paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle -Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read. +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. Example: ```python @@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ## Q & A -### Why reader return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, and not a mini batch? -Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2). -We provide function `paddle.batch` to turn (single entry) reader into batch reader. +We provide a function, `paddle.batch`, to turn a (single entry) reader into a batch reader. -### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient? +### Why do we need a batch reader, isn't it sufficient to give the reader and batch_size as arguments during training? -In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically. +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. -### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping? -We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`). +Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). -### How to create custom data reader creator +### How to create a custom data reader creator ? ```python def image_reader_creator(image_path, label_path, n): @@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ### How is `paddle.train` implemented -An example implementation of paddle.train could be: +An example implementation of paddle.train is: ```python def train(batch_reader, mapping, batch_size, total_pass): diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md deleted file mode 100644 index ac7e98ccf1aadbb973a4801fde842375cf63448c..0000000000000000000000000000000000000000 --- a/doc/design/refactor/distributed_architecture.md +++ /dev/null @@ -1,222 +0,0 @@ -# Design Doc: Distributed Training Architecture - -## Abstract - -PaddlePaddle v0.10.0 uses the "trainer-parameter server" -architecture. We run multiple replicated instances of trainers (runs -the same code written by the user) and parameter servers for -distributed training. This architecture served us well, but has some -limitations: - -1. Need to write special code to handle tasks which should only be run - by a single trainer. E.g., initializing model and saving model. - -2. Model parallelism is hard: need to write if-else branches conditioned - on the trainer ID to partition model onto each trainer, and manually - write the inter-model-shard communication code. - -3. The user can not directly specify the parameter update rule: need - to modify the parameter server C++ code and compile a new - binary. This adds complication for researchers: A lot of extra - effort is required. Besides, the training job submission program - may not allow running arbitrary binaries. - -This design doc discusses PaddlePaddle's new distributed training -architecture that addresses the above limitations. - -## Analysis - -We will assume the user writes the trainer program by Python, the same -analysis holds if the trainer program is written in C++. - -### Limitation 1 - -If we look at the Python code that the user writes, there are two -kinds of functionalities: - -- The training logic such as load / save model and print log. -- The neural network definition such as the definition of the data - layer, the fully connected layer, the cost function and the - optimizer. - -When we training with PaddlePaddle v0.10.0 distributedly, multiple -replicated Python instances are running on different nodes: both the -training logic and the neural network computation is replicated. - -The tasks that should only run once all belong to the training logic, -if we only replicate the neural network computation, but do **not** -replicate the training logic, the limitation could be solved. - -### Limitation 2 - -Model parallelism means running a single model on multiple nodes by -partitioning the model onto different nodes and managing the -inter-model-shard communications. 
- -PaddlePaddle should be able to modify the nerual network computation -definition to support model parallelism automatically. However, the -computation is only specified in Python code, and PaddlePaddle can not -modify Python code. - -Just like compiler uses a intermediate representation (IR) so that -programmer does not need to manually optimize their code in most of -the cases - the compiler will optimize the IR: - - - -We can have our own IR too: PaddlePaddle can support model parallel by -converting the IR so the user no longer need to manually do it in -Python: - - - -The IR for PaddlePaddle after refactor is called `Block`, it specifies -the computation dependency graph and the variables used in the -computation. - -### Limitation 3 - -The user can not directly specify the parameter update rule for the -parameter server because the parameter server does not use the same -computation definition as the trainer. Instead, the update rule is -baked in the parameter server. The user can not specify the update -rule in the same way of specifying the trainer computation. - -This could be fixed by making the parameter server run the same -computation definition as the trainer. For a detailed explanation, -please -see -[Design Doc: Operation Graph Based Parameter Server](./dist_train.md) - -## Distributed Training Architecture - -The new distributed training architecture can address the above -limitations. Below is the illustration: - - - -The architecture includes major components: *PaddlePaddle Python*, -*PaddlePaddle converter* and *PaddlePaddle runtime*: - -### PaddlePaddle Python - -PaddlePaddle Python is the Python library that user's Python trainer -invoke to build the neural network topology, start training, etc. - -```Python -paddle.init() -input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster -img, label = input[0], input[1] -hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh()) -prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax()) -cost = paddle.layer.classification_cost(input=prediction, label=label) -optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01) -session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1) -for i in range(1000): - _, cost_val = session.eval(targets=[cost, optimizer]) - print cost_val -``` - -The code above is a typical Python trainer code, the neural network -topology is built using helper functions such as -`paddle.layer.fc`. The training is done by calling `session.eval` -iteratively. - -#### session.eval - -As shown in the graph, `session.eval` sends the IR and the evaluation -inputs/targets to the PaddlePaddle cluster for evaluation. The -targets can be any variable in the computation graph. When the target -is the `optimizer` variable, the neural network will be optimized -once. When the target is the `cost` variable, `session.eval` returns -the cost value. - -The Python `session` is a wrapper of the C++ `Session` class. For more -information about `Session`, please -see [Design Doc: Session](./session.md). - -### PaddlePaddle Converter - -PaddlePaddle converter automatically converts the IR in the request -(IR and evaluation inputs/targets) from PaddlePaddle Python to new -partitioned IRs and dispatch the new IRs and evaluation inputs/targets -to different PaddlePaddle runtimes. Below are the steps: - -1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that - fetches the eval targets to the IR. - -1. 
Extract a new computation (sub)graph with `feed` and `fetch` OP as - the boundary. The runtime does not need to run the OP that is not - dependent by the `fetch` OP. - -1. Optimizes the computation graph. - -1. Place the OPs in the graph onto different devices on different - PaddlePaddle runtime according to a placement algorithm and device - constraint specified by the user. - -1. Partition the graph according to runtime boundaries and add `send` / - `recv` OP pair on the runtime boundaries. - -1. Dispatch the partitioned graph to different PaddlePaddle runtimes. - -1. PaddlePaddle runtimes with the `fetch` OP reports evaluation - results back to the converter, the convert reports the evaluation - results back to the PaddlePaddle Python. - -The output IRs will be cached to optimize the conversion latency. - - -#### Placement Algorithm - -Our first implementation will only support "trainer-parameter server" -placement: the parameters, initializers, and optimizers are placed on -the PaddlePaddle runtimes with the parameter server role. And -everything else will be placed on the PaddlePaddle runtimes with the -trainer role. This has the same functionality of our -"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but -is more general and flexible. - -In the future, we will implement the general placement algorithm, -which makes placements according to the input IR, and a model of -device computation time and device communication time. Model -parallelism requires the general placement algorithm. - - -### PaddlePaddle Runtime - -The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and -runs the IR. The runtime does not need to do OP placement since it's -already done by the converter. - - -### Local Training Architecture - -The local training architecture will be the same as the distributed -training architecture, the differences are everything runs locally, -and there is just one PaddlePaddle runtime: - - - - -### Training Data - -In PaddlePaddle v0.10.0, training data is typically read -with [data reader](../reader/README.md) from Python. This approach is -no longer efficient when training distributedly since the Python -process no longer runs on the same node with the trainer processes, -the Python reader will need to read from the distributed filesystem -(assuming it has the access) and send to the trainers, doubling the -network traffic. - -When doing distributed training, the user can still use Python data -reader: the training data are sent with `session.eval`. However should -be used for debugging purpose only. The users are encouraged to use -the read data OPs. - - -## References: - -[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) - -[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf) diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md deleted file mode 100644 index 1d9a26683c14f54e3b5fe41675cd03b5620646b8..0000000000000000000000000000000000000000 --- a/doc/design/refactor/session.md +++ /dev/null @@ -1,180 +0,0 @@ -# Design Doc: Session - -## Abstract - -The *session* object encapsulates the environment in which the -computation graph is executed. - -We will have the *local* session and *remote* session, they offer the -same [interface](#interface). 
The local session encapsulates the local -runtime environment and the remote session encapsulates the cluster -runtime environment. - -The local runtime environment contains: - -1. computation devices (i.e., CPU, GPU) handles, and -1. the [scope](../scope.md) which holds all variables. - -The remote runtime environment contains: - -1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster, - and -1. the distributed [scope](../scope.md) in a cluster which holds all - variables. - -The user can create a remote session on Paddle Cloud and evaluate the -computation graph with it. In this way, the user can control the -remote computation resource in a cluster from his local computer. - - -## Background - -The current design has an implicit global session in which -`paddle.eval()` is executed. The pain point is: - -Since the user is not able to explicitly switch between runtime -environments, the user cannot run a topology in two independent -environments. - -For example, in reinforcement learning, the user may want to have a -stale model for inference and a fresh model for training, and only -replace the stale model with the fresh model periodically. - -Furthermore, we have no concept that encapsulates a remote environment -that executes a computation graph. - -We need the session object to address above issues. - - -## Session - -A session is an object that owns the runtime environment. All -computations are executed through `session.eval()`. - - -### Interface - -```python -eval( - targets, - feed_dict=None, -) -``` - -Evaluates the target Operations or Variables in `targets`. - -- *targets*: the evaluation targets. Can be a single Operation or - Variable, or a list with the Operations or Variables as - elements. The value returned by `eval()` has the same shape as the - `target` argument. - - The PaddlePaddle program is represented by - the [ProgramDesc](../design/program.md), `eval()` will infer the - ProgramDesc from the given targets and run the PaddlePaddle - program. Please - see - [this graph](./distributed_architecture.md#local-training-architecture) for - the detailed illustration for the local session - and - [this graph](./distributed_architecture.md#distributed-training-architecture) for - the detailed illustration for the remote session. - -- *feed_dict*: a dictionary that contains the tensors which override - the edges of the computation graph. - - feed_dict not only can provide the input data, it can override any - OP's input as well: - - ```python - a = pd.constant(2.0, name="a") - b = pd.variable(name="b") - c = pd.mul(a,b) - sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0 - ``` - -```python -close() -``` - -Closes the session and releases the scope that the session owns. - - -### Create a Local Session - -```python -session( - devices=None -) -``` - -Creates a new session. One session owns one global scope, so creating -multiple sessions will create different scopes. - -- *devices*: a single `string` or a list of `string` of device names, - the corresponding devices will be the computation devices for - `eval()`. If not specified, all available devices (e.g., all GPUs) - will be used. The user doesn't need to specify the CPU device since - it will be always used. Multiple sessions can use the same device. 
- - -#### Example - -```Python -a = paddle.constant(1.0) -b = paddle.constant(2.0) -c = a + b -sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"]) -sess.eval(c) -sess.close() -``` - -### Create a Remote Session - -```python -create_cloud_job( - name, - num_trainer, - mem_per_trainer, - gpu_per_trainer, - cpu_per_trainer, - num_ps, - mem_per_ps, - cpu_per_ps, -) -``` - -Creates a Paddle Cloud job. Fails if the job name exists. - -```python -get_cloud_job( - name -) -``` - -Gets a Paddle Cloud job. - -```python -remote_session( - job -) -``` - -- *job*: the Paddle Cloud job. - -#### Example - -```Python -reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud -image = reader.column(0) -label = reader.column(1) -fc1 = paddle.op.fc(image, size=256, act="sigmoid") -fc2 = paddle.op.fc(fc1, size=10, act="softmax") -cost = paddle.op.cross_entropy(fc2, label) -opt = paddle.optimizer.sgd(cost) - -job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1) -sess = paddle.remote_ession(job) -for i in range(1000): - sess.eval(opt) -sess.close() -``` diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle deleted file mode 100644 index f8496e57326c38de7468eb452a7713291d57653c..0000000000000000000000000000000000000000 Binary files a/doc/design/refactor/src/distributed_architecture.graffle and /dev/null differ diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png deleted file mode 100644 index 410c4510c6aab301dec95e6427fe80ac24e105fe..0000000000000000000000000000000000000000 Binary files a/doc/design/refactor/src/distributed_architecture.png and /dev/null differ diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle deleted file mode 100644 index cc7783c45381f25ded0b898649322c81418ad317..0000000000000000000000000000000000000000 Binary files a/doc/design/refactor/src/local_architecture.graffle and /dev/null differ diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png deleted file mode 100644 index 4b999538b7825c805292ee28b5e3256d5543bd09..0000000000000000000000000000000000000000 Binary files a/doc/design/refactor/src/local_architecture.png and /dev/null differ diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 62ff8f3229bbbb5bc82e4da29259baffc30c2c87..b9787261092f1f27377886152cb1596d9ff54188 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -5,12 +5,11 @@ PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Vers PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -3. 对这个版本的提交,做如下几个操作: - * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 - * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 - * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 +1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 +1. 
对这个版本的提交,做如下几个操作: + * 使用Regression Test List作为检查列表,测试本次release的正确性。 + * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 + * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 * 编译这个版本的python wheel包,并发布到pypi。 * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 @@ -20,9 +19,9 @@ PaddlePaddle每次发新的版本,遵循以下流程: pip install twine twine upload dist/[package to upload] ``` -4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 -6. 协同完成Release Note的书写 + * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 +1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 +1. 协同完成Release Note的书写 需要注意的是: @@ -30,7 +29,31 @@ PaddlePaddle每次发新的版本,遵循以下流程: * `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 -# PaddlePaddle 分支规范 +## 发布wheel包到pypi + +使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) +完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 +弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后 +可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法 +使用`twine`工具上传即可。 + + + +* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux + 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 +* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。 + +## 发布Docker镜像 + +上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 +版本号对应的tag即可: + +1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。 +1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。 +1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]` +1. 执行 `docker push paddlepaddle/paddle:[version]` + +## PaddlePaddle 分支规范 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 @@ -47,11 +70,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 -# PaddlePaddle回归测试列表 +## PaddlePaddle回归测试列表 本列表说明PaddlePaddle发版之前需要测试的功能点。 -## PaddlePaddle Book中所有章节 +### PaddlePaddle Book中所有章节 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md new file mode 100644 index 0000000000000000000000000000000000000000..4c5f10e2ecb9ec09b78926ca27552741d02d7cc9 --- /dev/null +++ b/doc/design/support_new_device.md @@ -0,0 +1,240 @@ +# Design Doc: Supporting new Device/Library + +## Background + +Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. 
Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner. + +On the one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support the Eigen and MKL computing libraries while Nvidia GPUs support the Eigen and cuDNN computing libraries. We have to implement operator-specific kernels for each computing library. + +On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent. + +So, how to support a new Device/Library in Fluid becomes a challenge. + + +## Basic: Integrate A New Device/Library + +For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md). + +There are mainly three parts that we have to consider while integrating a new device/library: + +- Place and DeviceContext: indicates the device id and manages hardware resources + +- Memory and Tensor: malloc/free data on a certain device + +- Math Functor and OpKernel: implement computing units on certain devices/libraries + +### Place and DeviceContext + +Please note that devices and computing libraries do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support several devices. + +#### Place +Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. + +``` + | CPUPlace +Place --| CUDAPlace + | FPGAPlace +``` + +And `Place` is defined as follows: + +``` +typedef boost::variant<CPUPlace, CUDAPlace, FPGAPlace> Place; +``` + +#### DeviceContext + +Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. + + +``` + /-> CPUDeviceContext +DeviceContext ----> CUDADeviceContext + \-> FPGADeviceContext +``` + +An example of Nvidia GPU is as follows: + +- DeviceContext + + +``` +class DeviceContext { + virtual Place GetPlace() const = 0; +}; +``` + + +- CUDADeviceContext + + +``` +class CUDADeviceContext : public DeviceContext { + Place GetPlace() const override { return place_; } +private: + CUDAPlace place_; + cudaStream_t stream_; + cublasHandle_t cublas_handle_; + std::unique_ptr<Eigen::GpuDevice> eigen_device_; // binds with stream_ +}; +``` + +### Memory and Tensor + + +#### memory module + +Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36): + +``` +template <typename Place> +void* Alloc(Place place, size_t size); + +template <typename Place> +void Free(Place place, void* ptr); + +template <typename Place> +size_t Used(Place place); +``` + +To implement these interfaces, we have to implement a MemoryAllocator for different Devices. + + +#### Tensor + +[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place. + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template <typename T> + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block.
+ * @note If not exist, then allocation. + */ + template <typename T> + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template <typename T> + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr<Placeholder> holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory. + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data<float>(place); +``` + + + +### Math Functor and OpKernel + +Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in the operators/math directory as basic Functors. + +Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example: + +The interface is defined in the header file. + +``` +template <typename DeviceContext, typename T> +class MaxOutFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); +}; +``` + +The CPU implementation is in the .cc file + +``` +template <typename T> +class MaxOutFunctor<platform::CPUDeviceContext, T> { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + ... + } +}; +``` + +The CUDA implementation is in the .cu file + +``` +template <typename T> +class MaxOutFunctor<platform::CUDADeviceContext, T> { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + ... + } +}; +``` + + +We get the computing handle from a concrete DeviceContext and do the computation on tensors. + +The implementation of `OpKernel` is similar to the math functors; the extra thing we need to do is to register the OpKernel in a global map. + +Fluid provides different registration interfaces in op_registry.h + + +Let's take the [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example: + +In the .cc file: + +``` +REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>); +``` + +In the .cu file: + +``` +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>); +``` + + +## Advanced topics: How to switch between different Device/Library + +Generally, we will implement an OpKernel for every Device/Library of an Operator. We can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
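To make this switching idea concrete ahead of the linked design docs, here is a toy Python sketch of per-operator device fallback. Everything in it (`registered_kernels`, `copy_to`, `run_op`) is a hypothetical illustration of the mechanism, not Fluid code:

```python
# Kernels registered per (operator, place) pair; crf has no GPU kernel,
# mirroring the CPU-only crf example above.
registered_kernels = {
    ("mul", "CUDAPlace"): lambda x: ("mul ran on GPU", x),
    ("mul", "CPUPlace"): lambda x: ("mul ran on CPU", x),
    ("crf", "CPUPlace"): lambda x: ("crf ran on CPU", x),
}

def copy_to(data, place):
    # Stand-in for a device-to-device memory copy.
    return data

def run_op(op, data, executor_place):
    place = executor_place
    if (op, place) not in registered_kernels:
        # No kernel for the executor's place: fall back to CPU and
        # transfer the inputs there first.
        place = "CPUPlace"
        data = copy_to(data, place)
    return registered_kernels[(op, place)](data)

print(run_op("mul", [1, 2, 3], "CUDAPlace"))  # stays on the GPU
print(run_op("crf", [1, 2, 3], "CUDAPlace"))  # falls back to the CPU
```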
+ + +For more details, please refer to the following docs: + +- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md) +- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) diff --git a/doc/design/switch_kernel.md b/doc/design/switch_kernel.md new file mode 100644 index 0000000000000000000000000000000000000000..1846e5d9f99dd433b44ac6b5ae52893ec8f0d451 --- /dev/null +++ b/doc/design/switch_kernel.md @@ -0,0 +1,66 @@ +## Background +Every operator has many kernels because there are multiple data types, places, and data layouts that Fluid supports. We use the `KernelType` to describe the kernel types that operators can hold. + +The `KernelType` is as follows. + +``` +struct KernelType { + Place place_; + DataType data_type_; + LayoutType layout_; +}; +``` + +The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`. + +The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will have a major `data_type`. For example, `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`. + +The `layout` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel. + +## Problem + +Ideally, we register a kernel for every operator and every kernel type. However, it is impracticable in the following situations. + +1. Some operators, like CRF, are complicated and inefficient to implement on GPU. The CRF operator will only have a CPU kernel. +2. Some operators will take too much memory. It is better to force them onto the CPU. However, the rest of the operators in the neural network will be performed on the GPU, i.e., the model parallel problem. +3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c` and no other library uses `nChw8c`. + +The problems under these situations are similar. We can formalize the problem as follows. + +We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, where $kt_{?} \notin KT$. The question is how to cast the inputs of this operator from $kt_{?}$ to any of the kernel types in $KT$. + +## Solution + +It is clear that transforming the inputs of an operator to adapt to another kernel type is not related to the particular operator. So we should register these transformation methods as global methods. + +We can infer a kernel type from the inputs of an operator. We call this kernel type the `actual kernel type`, which means it is the kernel type with which the operator would actually be performed. + +We can also get a kernel type from 1) the configuration of the operator description (users may want to force the use of `MKL` for the `conv` operator), and 2) the place of the current executor (the executor is running on GPU). This kernel type is what we expect the operator to be performed with. We call this kernel type the `expected kernel type`. + +We transform the input data from `actual` to `expected` if the expected kernel type is not the same as the actual kernel type.
+
+The algorithm is described as follows:
+
+```cpp
+using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
+using KernelTypePair = std::pair<KernelType, KernelType>;
+
+map<KernelTypePair, DataTransformationFN> g_data_transformation_;
+
+void OpWithKernel::Run() {
+  vec<Tensor> inputs = ...
+  auto actual_kernel_type = GetActualKernelType(inputs);
+
+  // The expected kernel type is related to the actual kernel type.
+  // For most operators, the expected kernel type is the same as the
+  // actual kernel type.
+  //
+  // So we pass `actual_kernel_type` as a parameter of
+  // GetExpectedKernelType.
+  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
+
+  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
+
+  kernel.run(trans(inputs));
+}
+```
diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md
index 0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f..89fa95326c5c4909137544c6b5fd574e1281abe2 100644
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
@@ -1,12 +1,12 @@
 ## Background
-PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime.
 
 At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
 
-PaddlePaddle use proto message to describe compile time graph because
+PaddlePaddle uses proto messages to describe the compile time program because
 
-1. Computation graph should be able to be saved to a file.
-1. In distributed training, the graph will be serialized and send to multiple workers.
+1. The computation program description must be serializable and saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on different workers.
 
-The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below.
+The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data (i.e. `Variable`) and `Operations`. The concept to represent them is in the table below.
 
 | |compile time|runtime|
 |---|---|---|
diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst
index f1677e216f31d79b53ac29a0afbf6fbb886a0dcd..ed8a0c7e87da133138ecfc7ba6a8217d58b8f71d 100644
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -14,7 +14,7 @@
     $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。
 
@@ -109,3 +109,31 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二
 解决办法是:
 
 * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。
+
+8. 下载MKLML库失败
+------------------
+
+..
code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2 + make[1]: *** 正在等待未完成的任务.... + +原因:网速或SSL链接原因,导致MKLML库下载不成功。 + +解决办法是:手动下载并安装,具体步骤如下。 + +.. code-block:: bash + + // 1. 进入对应的目录 + cd build/third_party/mklml/src/extern_mklml + + // 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败: + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. 手动下载且解压缩,并手动生成download成功标签: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. 接着编译即可 diff --git a/doc/faq/local/src/reduce_min_pool_size.py b/doc/faq/local/src/reduce_min_pool_size.py index 5715397cc11e18246b8522fcc5b4f05780c9a0a7..96073633d2b45bf83927b8cf446919fe916438c2 100644 --- a/doc/faq/local/src/reduce_min_pool_size.py +++ b/doc/faq/local/src/reduce_min_pool_size.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. @provider(min_pool_size=0, ...) def process(settings, filename): os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. diff --git a/doc/faq/local/src/word2vec_config.py b/doc/faq/local/src/word2vec_config.py index 866b40c3d4c96c1213b3f716f29b14dd38763edb..03619b2628ffca6166e8784222b7ea0196194b82 100644 --- a/doc/faq/local/src/word2vec_config.py +++ b/doc/faq/local/src/word2vec_config.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. ... # the settings and define data provider is omitted. DICT_DIM = 3000 # dictionary dimension. word_ids = data_layer('word_ids', size=DICT_DIM) diff --git a/doc/faq/local/src/word2vec_dataprovider.py b/doc/faq/local/src/word2vec_dataprovider.py index ec2753a7d01d7dd4d804c3bed0bac1be9c3fb3d3..a439a8f52ebc13ef281012e647834fd53a924d74 100644 --- a/doc/faq/local/src/word2vec_dataprovider.py +++ b/doc/faq/local/src/word2vec_dataprovider.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. DICT_DIM = 3000 diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst deleted file mode 100644 index b473944fc7fb89d3e0a0b330933f2226734bb5bd..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_cn.rst +++ /dev/null @@ -1,108 +0,0 @@ -经典的线性回归任务 -================== - -PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。 - -任务简介 --------- - -我们展示如何用PaddlePaddle解决 `单变量的线性回归 `_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。 - -一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。 - -准备数据 ------------ - -假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。 - -.. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # 定义输入数据的类型: 2个浮点数 - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -训练模型 ------------ - -为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。 - -在PaddlePaddle里,该模型的网络配置如下。 - -.. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. 定义数据来源,调用上面的process函数获得观测数据 - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. 学习算法。控制如何改变模型参数 w 和 b - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. 神经网络配置 - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - # 线性计算网络层: ȳ = wx + b - ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = square_error_cost(input= ȳ, label=y) - outputs(cost) - - -这段简短的配置展示了PaddlePaddle的基本用法: - -- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。 - -- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。 - -- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元: - - - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 - -定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: - -.. 
code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - -PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 ` - -模型检验 ------------ - -训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 - -PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。 - -.. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - -.. image:: ./parameters.png - :align: center - :scale: 80 % - -从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。 - -这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。 diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst deleted file mode 100644 index 2cc438ebbe0f97345d25354b93b4ebbd43502415..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_en.rst +++ /dev/null @@ -1,101 +0,0 @@ -Simple Linear Regression -======================== - -PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on. - -Problem Background ------------------- - -Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets. - -Prepare the Data ------------------ - -Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types. - - .. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # define data types of input: 2 real numbers - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -Train a NeuralNetwork ----------------------- - -To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle: - - .. 
code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. read data. Suppose you saved above python code as dataprovider.py - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. learning algorithm - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. Network configuration - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = square_error_cost(input=y_predict, label=y) - outputs(cost) - -Some of the most fundamental usages of PaddlePaddle are demonstrated: - -- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly. - -- The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time. - -- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration: - - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``. - - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model. - - **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters. - -Now that everything is ready, you can train the network with a simple command line call: - - .. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - - -This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess. - - -Evaluate the Model -------------------- - -Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly. - -In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass. - - .. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. 
- return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - - .. image:: parameters.png - :align: center - -Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer. - -There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data. diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png deleted file mode 100644 index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..71904dc41ed0d946867d890cc585e1b88450ca8c --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -0,0 +1,151 @@ +从源码编译 +====================== + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。 +我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `_ 找到。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # 如果不使用Docker编译环境,执行下面的命令 + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + +如果机器中已经安装过PaddlePaddle,有两种方法: + +.. code-block:: bash + + 1. 先卸载之前的版本,再重新安装 + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. 直接升级到更新的版本 + pip install build/python/dist/*.whl -U + +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + +如果不使用Docker,可以执行ctest命令即可: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + ctest + # 指定执行其中一个单元测试 test_mul_op + ctest -R test_mul_op + +.. _compile_deps: + +编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. 
csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "ON" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md deleted file mode 100644 index 2f1461489495618718d5abaeab9cbeda9b93700f..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ /dev/null @@ -1,236 +0,0 @@ -Installing from Sources -========================== - -* [1. Download and Setup](#download) -* [2. Requirements](#requirements) -* [3. Build on Ubuntu](#ubuntu) -* [4. Build on Centos](#centos) - - -## Download and Setup -You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle). - -```bash -git clone https://github.com/PaddlePaddle/Paddle paddle -cd paddle -``` -## Requirements - -To compile the source code, your computer must be equipped with the following dependencies. - -- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler -- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X) -- **BLAS**: MKL, OpenBlas or ATLAS -- **Python**: only support Python 2.7 -- **Go** - -**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! -For CUDA 8.0, GCC versions later than 5.3 are not supported! - -### Options - -PaddlePaddle supports some build options. - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Optional | Description |
-| -------- | ----------- |
-| WITH_GPU | Compile PaddlePaddle with NVIDIA GPU |
-| WITH_AVX | Compile PaddlePaddle with AVX intrinsics |
-| WITH_DSO | Compile PaddlePaddle with dynamic linked CUDA |
-| WITH_TESTING | Compile PaddlePaddle with unit testing |
-| WITH_SWIG_PY | Compile PaddlePaddle with inference api |
-| WITH_STYLE_CHECK | Compile PaddlePaddle with style check |
-| WITH_PYTHON | Compile PaddlePaddle with python interpreter |
-| WITH_DOUBLE | Compile PaddlePaddle with double precision |
-| WITH_RDMA | Compile PaddlePaddle with RDMA support |
-| WITH_TIMER | Compile PaddlePaddle with stats timer |
-| WITH_PROFILER | Compile PaddlePaddle with GPU profiler |
-| WITH_DOC | Compile PaddlePaddle with documentation |
-| WITH_COVERAGE | Compile PaddlePaddle with code coverage |
-| COVERALLS_UPLOAD | Package code coverage data to coveralls |
-| ON_TRAVIS | Exclude special unit test on Travis CI |
- - -**Note:** - - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5. - - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported. - - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.** - -As a simple example, consider the following: - -1. **BLAS Dependencies(optional)** - - CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically. - To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. - - ```bash - # specify MKL - cmake .. -DMKL_ROOT= - # or specify OpenBLAS - cmake .. -DOPENBLAS_ROOT= - ``` - -2. **Doc Dependencies(optional)** - - To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: - - ```bash - pip install 'sphinx>=1.4.0' - pip install sphinx_rtd_theme recommonmark - - # install doxygen on Ubuntu - sudo apt-get install doxygen - # install doxygen on Mac OS X - brew install doxygen - - # active docs in cmake - cmake .. -DWITH_DOC=ON` - ``` - -## Build on Ubuntu 14.04 - -### Install Dependencies - -- **Paddle Dependencies** - - ```bash - # necessary - sudo apt-get update - sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake - sudo apt-get install -y python python-pip python-numpy libpython-dev bison - sudo pip install 'protobuf==3.1.0.post1' - - # Install Go - # You can follow https://golang.org/doc/install for a detailed explanation. - wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C $HOME -xzf go.tgz && \ - mkdir $HOME/gopath && \ - rm go.tgz - - # Setup environment variables - export GOROOT=$HOME/go - export GOPATH=$HOME/gopath - export PATH=$PATH:$GOROOT/bin - - # install cmake 3.4 - curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \ - cd .. && rm -rf cmake-3.4.1 - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a GCC compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake .. 
-DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` - -## Build on Centos 7 - -### Install Dependencies - -- **CPU Dependencies** - - ```bash - # necessary - sudo yum update - sudo yum install -y epel-release - sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git - sudo pip install wheel numpy - sudo pip install 'protobuf>=3.0.0' - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a GCC compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake3 .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..27f73b2e2c029b41d514e1612912ed1c335605b6 --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -0,0 +1,169 @@ +Build from Sources +========================== + +.. _build_step: + +How To Build +---------------- + +PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile +tools. We recommend you to use our pre-built Docker image to run the build +to avoid installing dependencies by yourself. We have several build environment +Docker images `here `_ . + +If you choose not to use Docker image for your build, you need to install the +below `Compile Dependencies`_ before run the build. + +Then run: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # run the following command to build a CPU-Only binaries if you are using docker + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # else run these commands + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. 
+    make
+
+When the compilation finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on the local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+    pip install build/python/dist/*.whl
+
+If the machine has installed PaddlePaddle before, there are two methods:
+
+.. code-block:: bash
+
+    1. uninstall and reinstall
+    pip uninstall paddlepaddle
+    pip install build/python/dist/*.whl
+
+    2. upgrade directly
+    pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the steps below:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+
+If you don't use Docker, simply running ctest will start the tests:
+
+.. code-block:: bash
+
+    mkdir build
+    cd build
+    cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+    make
+    ctest
+    # run a single test like test_mul_op
+    ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+--------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here `_ .
+
+.. _build_options_bool:
+
+Bool Type Options
+-----------------
+
+You can add a :code:`-D` argument to pass such options, like:
+
+.. code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool Type Options
+   :header: "Option", "Description", "Default"
+   :widths: 1, 7, 2
+
+   "WITH_GPU", "Build with GPU support", "ON"
+   "WITH_C_API", "Build only CAPI", "OFF"
+   "WITH_DOUBLE", "Build with double precision", "OFF"
+   "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+   "WITH_AVX", "Build with AVX support", "ON"
+   "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+   "WITH_STYLE_CHECK", "Check code style when building", "ON"
+   "WITH_TESTING", "Build unit tests", "ON"
+   "WITH_DOC", "Build documentations", "OFF"
+   "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+   "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+   "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL `_ and
+`OpenBlAS `_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports the AVX2 instruction set, MKL-DNN
+will also be downloaded and used; see `details `_ .
+
+If you choose not to use MKL, then OpenBlAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM architecture
+automatically in order to speed up the build.
+ +PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to +keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN +you built. + +Pass Compile Options +++++++++++++++ + +You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. +When running cmake command, it will search system paths like +:code:`/usr/lib:/usr/local/lib` and then search paths that you +passed to cmake, i.e. + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png deleted file mode 100644 index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst deleted file mode 100644 index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst +++ /dev/null @@ -1,43 +0,0 @@ -PaddlePaddle的编译选项 -====================== - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 - -Bool型的编译选项 ----------------- -用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: Bool型的编译选项 - :widths: 1, 7, 2 - :file: compile_options.csv - -BLAS/CUDA/Cudnn的编译选项 --------------------------- -BLAS -+++++ - -PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。 - -.. csv-table:: BLAS路径相关的编译选项 - :widths: 1, 2, 7 - :file: cblas_settings.csv - -CUDA/Cudnn -+++++++++++ - -PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. 
-DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv deleted file mode 100644 index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv +++ /dev/null @@ -1,5 +0,0 @@ -编译选项,描述,注意 -MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 -ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 -OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 -REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv deleted file mode 100644 index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ /dev/null @@ -1,12 +0,0 @@ -选项,说明,默认值 -WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 -WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA,否 -WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST -WITH_DOC,是否编译中英文文档,否 -WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 0d34dec8e908c5e61001500725187a2233797f46..bae42593ddc6f7a7eb47d603752ad6efa9820b45 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -1,222 +1,139 @@ -PaddlePaddle的Docker容器使用方式 +使用Docker安装运行 ================================ -PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 -Docker使用入门 ------------------------------- - -几个基础的概念帮助理解和使用Docker: +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 -- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行: +在了解Docker的基本使用方法之后,即可开始下面的步骤: - .. code-block:: bash +.. _docker_pull: - docker images +获取PaddlePaddle的Docker镜像 +------------------------------ - 来列出当前系统中的所有镜像,同样可以执行: +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - 来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。 + docker pull paddlepaddle/paddle -- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。 - 实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。 - 可以执行: +对于国内用户,我们提供了加速访问的镜像源: .. 
code-block:: bash - docker run paddlepaddle/paddle:0.10.0 + docker pull docker.paddlepaddle.org/paddle - 来使用一个镜像启动一个容器。 - -- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 - 访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到 - Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用 - debian镜像,并且启动后执行 :code:`ls /data`。 +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -PaddlePaddle发布的Docker镜像使用说明 ------------------------------- - -我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了 -PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜 -像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 -PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 -行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ -和国内镜像`docker.paddlepaddle.org` 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 - -**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。** - -1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` - - 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, - 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 - 开发镜像包含了以下工具: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - 很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作, - 也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发: - - 以交互容器方式运行开发镜像: - - .. code-block:: bash - - docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash - - 或者,可以以后台进程方式运行容器: - - .. code-block:: bash - - docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D - - 然后用密码 :code:`root` SSH进入容器: - - .. code-block:: bash - - ssh -p 2202 root@localhost - - SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 - -2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - 纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - 如果输出是No,就需要选择使用no-AVX的镜像 + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu - **注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用** +选择下载使用不同的BLAS库的Docker镜像: - 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 - 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 - - .. code-block:: bash + .. code-block:: bash - nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas - 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - .. code-block:: bash - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu + .. code-block:: bash -3. 运行以及发布您的AI程序 + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - 假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: +.. _docker_run: - .. 
code-block:: bash +在Docker中执行PaddlePaddle训练程序 +---------------------------------- - docker run -it -v $PWD:/work paddle /work/a.py +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: - 如果要使用GPU,请运行: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 - nvidia-docker run -it -v $PWD:/work paddle /work/a.py +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0` - 创建和发布自己的AI程序镜像。 +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** -运行PaddlePaddle Book ---------------------- +.. _docker_run_book: -Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 +使用Docker启动PaddlePaddle Book教程 +----------------------------------- +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book 然后在浏览器中输入以下网址: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ 就这么简单,享受您的旅程! -通过Docker容器开发PaddlePaddle ------------------------------- - -开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 - -1. 制作PaddlePaddle开发镜像 +.. _docker_run_gpu: - PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。 - 生成Docker镜像的方式有两个,一个是直接把一个容器转换成镜像,另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷,适合自己实验,可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚,其他人很容易看懂镜像生成过程,持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行: +使用Docker执行GPU训练 +------------------------------ - .. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build -t paddle:dev . +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 - docker build这个命令的-t指定了生成的镜像的名字,这里我们用paddle:dev。到此,PaddlePaddle开发镜像就被构建完毕了。 + .. code-block:: bash -2. 制作PaddlePaddle生产镜像 + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash - 生产镜像的生成分为两步,第一步是运行: +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - .. code-block:: bash - - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev - - 以上命令会编译PaddlePaddle,生成运行程序,以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU,“WITH_AVX”控制生成的生产镜像是否支持AVX,”WITH_TEST“控制是否生成单元测试。 + .. code-block:: bash - 第二步是运行: + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu - .. 
code-block:: bash - - docker build -t paddle:prod -f build/Dockerfile ./build +**关于AVX:** - 以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置,最终生成名为paddle:prod的生产镜像。 +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 -3. 运行单元测试 - - 运行以下指令: +以下指令能检查Linux电脑是否支持AVX: .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" - -文档 ----- - -Paddle的Docker开发镜像带有一个通过 `woboq code browser -`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。 -只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码: - -.. code-block:: bash - - docker run -d --name paddle-cpu-doc paddle:0.10.0-dev - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 94860240f6a4a9bed8a865684a8a79960489280e..56a7c68e4d39c45249fa55a964dc48b7081596a6 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -1,270 +1,146 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= -Docker container is currently the only officially-supported way to -running PaddlePaddle. This is reasonable as Docker now runs on all -major operating systems including Linux, Mac OS X, and Windows. -Please be aware that you will need to change `Dockers settings -`_ to make full use -of your hardware resource on Mac OS X and Windows. +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . -Working With Docker -------------------- +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. -Docker is simple as long as we understand a few basic concepts: +After you've read above tutorials you may proceed the following steps. -- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type +.. _docker_pull: - .. code-block:: bash - - docker images +Pull PaddlePaddle Docker Image +------------------------------ - to list all images in the system. We can also run +Run the following command to download the latest Docker images, the version is cpu_avx_mkl: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - to download a Docker image, paddlepaddle/paddle in this example, - from Dockerhub.com. + docker pull paddlepaddle/paddle -- *container*: considering a Docker image a program, a container is a - "process" that runs the image. Indeed, a container is exactly an - operating system process, but with a virtualized filesystem, network - port space, and other virtualized environment. We can type +For users in China, we provide a faster mirror: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0 + docker pull docker.paddlepaddle.org/paddle - to start a container to run a Docker image, paddlepaddle/paddle in this example. 
- -- By default docker container have an isolated file system namespace, - we can not see the files in the host file system. By using *volume*, - mounted files in host will be visible inside docker container. - Following command will mount current dirctory into /data inside - docker container, run docker container from debian image with - command :code:`ls /data`. +Download GPU version (cuda8.0_cudnn5_avx_mkl) images: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -Usage of CPU-only and GPU Images ----------------------------------- - -We package PaddlePaddle's compile environment into a Docker image, -called the develop image, it contains all compiling tools that -PaddlePaddle needs. We package compiled PaddlePaddle program into a -Docker image as well, called the production image, it contains all -runtime environment that running PaddlePaddle needs. For each version -of PaddlePaddle, we release both of them. Production image includes -CPU-only version and a CUDA GPU version and their no-AVX versions. - -We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. - -** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.** - - -1. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - -2. Production images, this image might have multiple variants: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - Please be aware that the CPU-only and the GPU images both use the - AVX instruction set, but old computers produced before 2008 do not - support AVX. The following command checks if your Linux computer - supports AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - **NOTE:versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.** - To run the CPU-only image as an interactive container: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash - - Above method work with the GPU image too -- the recommended way is - using `nvidia-docker `_. - - Please install nvidia-docker first following this `tutorial - `_. - - Now you can run a GPU image: - - .. code-block:: bash - - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash - - -Train Model Using Python API ----------------------------- - -Our official docker image provides a runtime for PaddlePaddle -programs. 
The typical workflow will be as follows: - -Create a directory as workspace: - -.. code-block:: bash - - mkdir ~/workspace - -Edit a PaddlePaddle python program using your favourite editor - -.. code-block:: bash - - emacs ~/workspace/example.py - -Run the program using docker: - -.. code-block:: bash - - docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py - -Or if you are using GPU for training: + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu -.. code-block:: bash +Choose between different BLAS version: - nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py - -Above commands will start a docker container by running :code:`python -/workspace/example.py`. It will stop once :code:`python -/workspace/example.py` finishes. - -Another way is to tell docker to start a :code:`/bin/bash` session and -run PaddlePaddle program interactively: - -.. code-block:: bash - - docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash - # now we are inside docker container - cd /workspace - python example.py - -Running with GPU is identical: - -.. code-block:: bash - - nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash - # now we are inside docker container - cd /workspace - python example.py - - -Develop PaddlePaddle or Train Model Using C++ API ---------------------------------------------------- - -We will be using PaddlePaddle development image since it contains all -compiling tools and dependencies. + .. code-block:: bash -1. Build PaddlePaddle develop image + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas - Use following command to build PaddlePaddle develop image: - .. code-block:: bash +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: - git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle - docker build -t paddle:dev . - -2. Build PaddlePaddle production image + .. code-block:: bash - There are two steps for building production image, the first step is to run: + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - .. code-block:: bash +.. _docker_run: - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev +Launch your training program in Docker +-------------------------------------- - The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: - The second step is to run: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py - docker build -t paddle:prod -f build/Dockerfile ./build +In the above command, :code:`-it` means run the container interactively; +:code:`-v $PWD:/work` means mount the current directory ($PWD will expand +to current absolute path in Linux) under :code:`/work` in the container. 
+:code:`paddlepaddle/paddle` specifies the image to use; finally +:code:`/work/train.py` is the command to run inside docker. - The above command will generate the production image by copying the compiled PaddlePaddle program into the image. +Also, you can go into the container shell, run or debug your code +interactively: -3. Run unit test + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - Following command will run unit test: +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** - .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +.. _docker_run_book: PaddlePaddle Book ------------------ -The Jupyter Notebook is an open-source web application that allows -you to create and share documents that contain live code, equations, -visualizations and explanatory text in a single browser. - -PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. -We already exposed port 8888 for this book. If you want to +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers. If you want to dig deeper into deep learning, PaddlePaddle Book is definitely your best choice. We provide a packaged book image; simply issue the command: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book Then, you can copy and paste the address into your local browser: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ That's all. Enjoy your journey! +.. _docker_run_gpu: -Documentation ------------- +Train with Docker using GPU +------------------------------ -Paddle Docker images include an HTML version of C++ source code -generated using `woboq code browser -`_. This makes it easy -for users to browse and understand the C++ source code. +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have the latest +GPU driver installed before moving on. -As long as we give the Paddle Docker container a name, we can run an -additional Nginx Docker container to serve the volume from the Paddle -container: + .. code-block:: bash -.. code-block:: bash + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash - docker run -d --name paddle-cpu-doc paddle: - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** + .. code-block:: bash -Then we can direct our Web browser to the HTML version of source code -at http://localhost:8088/paddle/ + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +**About AVX:** + +AVX is a kind of CPU instruction that can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
+ +The following command will tell you whether your computer supports AVX. + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index dd9923697ab85825557aa89a08870bece7c76673..c9ba84c842b530162c92713046e64fdf82bd441b 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -6,22 +6,28 @@ 安装流程 ++++++++ -PaddlePaddle提供Docker镜像来部署环境。 +PaddlePaddle提供pip和Docker的安装方式: .. toctree:: :maxdepth: 1 - - docker_install_cn.rst + pip_install_cn.rst + docker_install_cn.rst + ../../howto/dev/build_cn.md 编译流程 ++++++++ .. warning:: - 编译流程主要推荐高级用户查看,普通用户请走安装流程。 + 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 .. toctree:: :maxdepth: 1 - cmake/build_from_source_cn.rst + build_from_source_cn.rst + +常见问题解答 +++++++++++ + +`常见问题解答 `_ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index 8a53588e0439df8f4d5fd529b7a20262c67d4e58..32d66d63dd5b2a30d5de4a088dc80b680830cb84 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -1,22 +1,34 @@ Install and Build ================= -Install PaddlePaddle ----------------------- +.. _install_steps: -.. toctree:: - :maxdepth: 1 +Install Steps ++++++++++++++ + +You can choose either pip or Docker to complete your installation: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst + ../../howto/dev/build_en.md - docker_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + We recommend installing via the above installation steps; you only need to build PaddlePaddle from source when you need a modified binary. .. toctree:: :maxdepth: 1 build_from_source_en.md + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..0c741e936b46eda5e7165e4ee54b545b14a28a19 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -0,0 +1,86 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas。 + + .. code-block:: bash + + pip install paddlepaddle + + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登录界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. 
csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +运行环境依赖 +------------------------------ + +PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 + +PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 + +.. csv-table:: PaddlePaddle环境依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上" + "Python", "2.7.x", "暂时不支持Python3" + "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号" + "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号" + +.. _pip_faq: + +安装常见问题和解决方法 +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + + 如果仍然存在问题,可以执行: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。 + + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..285ed09805b09790beaef014f6813c227aff33ac --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -0,0 +1,104 @@ +Install Using pip +================================ + +You can use the widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method works on +most current Linux systems and MacOS. + +.. _pip_install: + +Install Using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine; it will also download required packages. The version installed is cpu_avx_openblas. + + .. code-block:: bash + + pip install paddlepaddle + + +If you wish to install the GPU version (cuda7.5_cudnn5_avx_openblas), just run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. Access +the links below, log in as guest, then click the "Artifact" +tab; there you'll find the download links of the whl packages. + +If the links below show a login form, just click "Log in as guest" to start the download: + +.. 
image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) do not only contain .py files +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainstream Linux distributions, like CentOS 6 and Ubuntu 14.04, +as well as MacOS 10.12. + +PaddlePaddle whl packages try to satisfy the +`manylinux1 `_ +standard, which uses CentOS 5 as the default build environment. But CUDA libraries +seem to require at least CentOS 6, and CentOS 5 is about to reach end of life, +so we use CentOS 6 as the default build environment. + +.. csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc must include at least the GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "Must include at least the GLIBCXX_3.4.11 and CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "Must include at least the GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using the Python 2.7 series. + Besides, pypi only supports the manylinux1 standard, so you'll need to + upgrade your pip to >9.0.0. Run the command below: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + This prints the package tags supported on the current system; check whether they match + the file name of the whl package. You can find the default whl packages + `here `_. + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; if your system supports + manylinux1_x86_64 but the whl package is linux_x86_64, you can rename the + file with the manylinux1_x86_64 suffix and then install it. diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee71cd7a9a4fbddb93fa3aa2d9349f01f3673982 --- /dev/null +++ b/doc/getstarted/concepts/src/infer.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import paddle.v2 as paddle +import numpy as np + +paddle.init(use_gpu=False) +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) +y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + +# load the model generated by training +with open('params_pass_90.tar', 'r') as f: + parameters = paddle.parameters.Parameters.from_tar(f) + +# Feed multiple input samples; the inference results are returned in an array. +i = [[[1, 2]], [[3, 4]], [[5, 6]]] +print paddle.infer(output_layer=y_predict, parameters=parameters, input=i) +# Will print: +# [[ -3.24491572] +# [ -6.94668722] +# [-10.64845848]] diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py index 8aceb23406a476f08639cc6223cdf730b728a705..d9c0c66b8a7bbb66d7b66cce38220a4c62fd6849 100644 --- a/doc/getstarted/concepts/src/train.py +++ b/doc/getstarted/concepts/src/train.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2 as paddle import numpy as np @@ -26,6 +39,11 @@ def event_handler(event): if event.batch_id % 1 == 0: print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, event.cost) + # save the model every 10 passes + if isinstance(event, paddle.event.EndPass): + if event.pass_id % 10 == 0: + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) # define training dataset reader diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index c243083794bb3c4659242de99b3b2715af9d7c24..e695ff283e2e806377a51c559b37e8068360a4ff 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. literalinclude:: src/train.py :linenos: +使用以上训练好的模型进行预测,取其中一个模型params_pass_90.tar,输入需要预测的向量组,然后打印输出: + +.. literalinclude:: src/infer.py + :linenos: + 有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index aa418c657a4ba16cce61c030066f4d3e14e891cc..9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -1,10 +1,61 @@ 新手入门 ============ +.. _quick_install: + +快速安装 +++++++++ + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装,版本为cpu_avx_openblas: + + .. 
code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考: + .. toctree:: :maxdepth: 1 build_and_install/index_cn.rst - concepts/use_concepts_cn.rst -- `深度学习入门课程 `_ +.. _quick_start: + +快速开始 +++++++++ + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index be3253e3d41b99a2b696e2c5ef6463ed49680d69..063d9d880c82550f7f5d47d3d0b1fff59865bca7 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -1,9 +1,61 @@ GET STARTED ============ +.. _quick_install: + +Quick Install +---------------------- + +You can use pip to install PaddlePaddle with a single command; it supports +CentOS 6 and above, Ubuntu 14.04 and above, or MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install; the version is cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install the GPU version (cuda7.5_cudnn5_avx_openblas), run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: + .. toctree:: :maxdepth: 1 build_and_install/index_en.rst -- `Deep Learning 101 `_ + +.. _quick_start: + +Quick Start ++++++++++++ + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data.
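The quick start above runs inference on the bundled UCI Housing test set. If you would rather feed a hand-written sample, the pattern mirrors the infer.py example added under doc/getstarted/concepts/src in this change; the sketch below is illustrative only, and the 13 zero values are placeholder features rather than meaningful input:

 .. code-block:: python

     import paddle.v2 as paddle

     # Initialize PaddlePaddle and rebuild the same network as housing.py.
     paddle.init(use_gpu=False, trainer_count=1)
     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())

     # One hand-written 13-dimensional sample; values are arbitrary placeholders.
     sample = [[[0.0] * 13]]
     probs = paddle.infer(
         output_layer=y_predict,
         parameters=paddle.dataset.uci_housing.model(),
         input=sample)
     print 'Predicted price: ${:,.2f}'.format(probs[0][0] * 1000)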
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md index 0b911f7b7509da4a147c65954acb7e7c38f489da..4a80a5245102fb992f513a749f6a02e1130188af 100644 --- a/doc/howto/dev/build_cn.md +++ b/doc/howto/dev/build_cn.md @@ -1,4 +1,4 @@ -# 编译PaddlePaddle和运行单元测试 +# 用Docker编译和测试PaddlePaddle ## 需要的软硬件 diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md index d0048e3714a5861a503736879d6c0870e5906c95..91c41ef8ce3abdec5d69a9cbcebbc49b17d8f663 100644 --- a/doc/howto/dev/build_en.md +++ b/doc/howto/dev/build_en.md @@ -1,4 +1,4 @@ -# Build PaddlePaddle from Source Code and Run Unit Test +# Build using Docker ## What Developers Need diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 699390145226ec2b65fdf5122db187e1d30d669e..3e0bf7b3973079a2063d33b6be4fe8a9dc5c07bb 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a") ## 构建和测试 -编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:dev`来代替。 +编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 如要build这个开发镜像,在源码目录树的根目录中运行: ```bash -➜ docker build -t paddle:dev . +➜ docker build -t paddle:latest-dev . ``` 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev ``` 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): @@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a") 如果要运行所有的单元测试,可以用如下命令: ```bash -➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" ``` 关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。 diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 6cfc9536f20e88571a9845a50be0341fe4d9f78b..92996585674b46f45549b972b9f295503b1c7f8c 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -1,17 +1,18 @@ # 如何写新的Operator - [概念简介](#概念简介) - - [实现C++类](#实现C++类) - - [定义ProtoMaker类](#定义ProtoMaker类) - - [定义Operator类](#定义Operator类) - - [定义OpKernel类](#定义OpKernel类) - - [注册Operator](#注册Operator) + - [实现C++类](#实现c类) + - [定义ProtoMaker类](#定义protomaker类) + - [定义Operator类](#定义operator类) + - [定义OpKernel类](#定义opkernel类) + - [注册Operator](#注册operator) - [编译](#编译) - - [绑定Python](#绑定Python) + - [绑定Python](#绑定python) - [实现单元测试](#实现单元测试) - - [前向Operator单测](#前向Operator单测) - - [反向Operator单测](#反向Operator单测) + - [前向Operator单测](#前向operator单测) + - [反向Operator单测](#反向operator单测) - [编译和执行](#编译和执行) + - [注意事项](#注意事项) ## 概念简介 @@ -23,15 +24,15 @@ - `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 - `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 
-依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: +依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: 内容 | 定义位置 -------------- | :---------------------- OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake Op定义 | `.cc`文件 -Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。 -注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中 +Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** @@ -43,7 +44,7 @@ Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU ## 实现C++类 -### 1. 定义ProtoMaker类 +### 定义ProtoMaker类 矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 @@ -52,7 +53,7 @@ Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU ```cpp class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor), 2D tensor of size (M x K)"); AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); @@ -81,7 +82,7 @@ The equation is: Out = X * Y template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of scale operator.").NotInGradient(); AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); @@ -100,7 +101,7 @@ The equation is: Out = scale*X - `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 -### 2. 定义Operator类 +### 定义Operator类 下面的点实现了MulOp的定义: @@ -149,11 +150,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 -### 3. 
定义OpKernel类 +### 定义OpKernel类 `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: -- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - `typename T` : 表示数据类型,如`float`, `double`等。 @@ -165,7 +166,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 下面是 `MulKernel` `Compute`的实现: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -173,33 +174,32 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; ``` -需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** +需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** -`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 -为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 -### 4. 注册Operator +### 注册Operator - 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` 在上面的代码中: @@ -209,20 +209,20 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 -- 在 `.cu`文件中注册GPU Kernel。 - - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: +- 在 `.cu`文件中注册CUDA Kernel。 + - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: ```cpp // if use Eigen unsupported module before include head files - // #define EIGEN_USE_GPU + #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` -### 5. 
编译 +### 编译 运行下面命令可以进行编译: @@ -236,71 +236,57 @@ make mul_op ## 实现单元测试 -单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向Op不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 -### 前向Operator单元测试 +### 前向Operator单测 -前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要: +Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: 1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 2. 生成随机的输入数据。 3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 +4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。 ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` - -上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: - -- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 -- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 -- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 + def test_check_output(self): + self.check_output() -### 反向Operator单元测试 + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。 + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } - - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + ``` - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 -下面解释代码中一些关键的地方: -- 调用`create_op("mul")`创建反向Op对应的前向Op。 +### 反向operator单测 +而反向测试中: - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 @@ -308,7 +294,7 @@ class TestMulGradOp(GradientChecker): - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 -### 编译和执行单元测试 +### 编译和执行 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 @@ -328,5 +314,5 @@ ctest -R test_mul_op - 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 - 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 -- 如果Op没有实现GPU 
Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index 1e88e1f5b4df710f1b69f0305d8d8a2921c4249a..7175d8370d6ce08c6d502eb42b8e53252db89bbb 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -1,8 +1,8 @@ # How to write a new operator - [Background](#background) - - [Implementing C++ Types](#implementing-c++-types) - - [Defining ProtoMaker](#defining-protoMaker) + - [Implementing C++ Types](#implementing-c-types) + - [Defining ProtoMaker](#defining-protomaker) - [Defining Operator](#defining-operator) - [Registering Operator](#registering-operator) - [Compilation](#compilation) @@ -28,8 +28,8 @@ An operator can be differentiated by whether it has kernel methods. An operator -------------- | :---------------------- OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. Op definition | `.cc` files -Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files. -Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation. +Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu` files. +Registering the Op | Ops are registered in `.cc` files; for Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. ** @@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd ## Implementing C++ Types -### 1. Defining Class ProtoMaker +### Defining ProtoMaker Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output.
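Before implementing the C++ classes, it may help to pin down the reference semantics. The Python unit test shown later in this guide validates MulOp against numpy's matrix product with exactly the shapes used below; this is a minimal sketch of that reference computation, not part of the operator itself:

```python
import numpy as np

# Reference semantics of MulOp: Out = X * Y, a plain matrix product.
X = np.random.random((32, 84)).astype("float32")   # shape (M x K)
Y = np.random.random((84, 100)).astype("float32")  # shape (K x N)
Out = np.dot(X, Y)                                 # shape (M x N)
assert Out.shape == (32, 100)
```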
@@ -50,7 +50,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio ```cpp class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor), 2D tensor of size (M x K)"); AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); @@ -79,7 +79,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of scale operator.").NotInGradient(); AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); @@ -98,7 +98,7 @@ There are two changes in this example: - `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. -### 2. Defining Operator +### Defining Operator The following code defines the interface for MulOp: @@ -147,11 +147,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. -### 3. Defining OpKernel +### Defining OpKernel `MulKernel` inherits `framework::OpKernel`, which includes the following templates: -- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). - `typename T` denotes data type, such as `float` or `double`. 
@@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel`'s implementation of `Compute` is as follows: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; ``` -Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** +Note that **different devices (CPU, CUDA) share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** -`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +`MulOp`'s CPU and CUDA implementations share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). @@ -189,16 +188,16 @@ This concludes the forward implementation of an operator. Next its operation and The definition of its corresponding backward operator, if applicable, is similar to that of a forward operator. **Note that a backward operator does not include a `ProtoMaker`**. -### 4. Registering Operator +### Registering Operator - In `.cc` files, register forward and backward operator classes and the CPU kernel. ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` In that code block, @@ -208,20 +207,20 @@ The definition of its corresponding backward operator, if applicable, is similar - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
-- Registering GPU Kernel in `.cu` files - - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as +- Registering CUDA Kernel in `.cu` files + - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as ```cpp // if use Eigen unsupported module before include head files #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` -### 5. Compilation +### Compilation Run the following commands to compile. @@ -253,62 +252,51 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass 2. Generating random input data. -3. Implementing the same computation logic in a Python script: +3. Implementing the same computation logic in a Python script. + +4. Calling the gradient check functions to verify the backward operator. ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) ``` Get its output, and compare it with the forward operator's own output. The code above first loads required packages. In addition, we have -- `self.type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.op_type = "mul" ` defines the type, which must be identical to the operator's registered type. - `self.inputs` defines input, with type `numpy.array` and initializes it. - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. ### Testing Backward Operators -A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**. 
- -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } - - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` - -Some key points in the code above include: +Some key points of the gradient check above include: -- `create_op("mul")` creates the backward operator's corresponding forward operator. - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. - The second variable `"Out"` points to the network's final output target `Out`. @@ -338,5 +326,5 @@ ctest -R test_mul_op - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. - The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. -- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. +- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 61f3a223547b352cf7929615cf3682b29b9a738f..1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -3,12 +3,64 @@ ################## PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 - +也可以利用PaddlePaddle.org 工具来编译文档,这种情况下所有的文件会存放在整理过的文件目录 .ppo_workspace/content 下。 如何构建文档 ============ -PaddlePaddle的文档构建有两种方式。 +PaddlePaddle的文档构建有三种方式。 + + +使用PaddlePaddle.org工具 +------------------------ +这是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 + +该工具使用Docker运行,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后即可用以下命令启动工具: + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令。 +之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content 下。 + +如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 + +.. 
code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle的工作目录指定给环境变量 CONTENT_DIR。 +之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content 下。 + +想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 使用Docker构建 -------------- @@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。 PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 -如何更新文档主题 -================ - -PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下,包含所有和前端网页设计相关的文件。 - -如何更新doc.paddlepaddle.org +如何更新www.paddlepaddle.org ============================ -更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 +更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 .. _cmake: https://cmake.org/ diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..b3ef07eb1d0012827df8e6a4f27c5fa643649492 --- /dev/null +++ b/doc/howto/dev/write_docs_en.rst @@ -0,0 +1,80 @@ +######################## +Contribute Documentation +######################## + +PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. +Both are compiled by `cmake`_ and `sphinx`_; the compiled documentation will be stored under the ``doc`` and ``doc_cn`` directories. +When using PaddlePaddle.org to compile documentation, the compiled documentation will be stored under a consolidated directory: .ppo_workspace/content + +How to Build Documentation +========================== + +We recommend using the PaddlePaddle.org tool to build documentation. + + +Use the PaddlePaddle.org tool +----------------------------- +This is the recommended method to build documentation. It compiles documentation and lets you preview it in a web browser. + +The tool uses Docker; please install Docker on your system first. Check the Docker official website on how to install it. You may use the following commands to activate the tool: + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories. You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command. +Use a web browser and navigate to http://localhost:8000, then click the buttons to compile the documentation. +The compiled documentation will be stored in /.ppo_workspace/content + + +If you don't wish to use Docker, you can also activate the tool through Django. 
Use the following commands to set it up: +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Use a web browser and navigate to http://localhost:8000, then click the buttons to compile the documentation. +The compiled documentation will be stored in /.ppo_workspace/content + +If you want to learn more about PaddlePaddle.org, please `click here `_. + +How to Write Documentation +========================== + +PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more details. + + +How to update www.paddlepaddle.org +================================== + +Please create PRs and submit them to github; see `Contribute Code `_. +The PaddlePaddle develop branch documentation is updated automatically once the PR is merged. Users may check the latest `Chinese Docs `_ and +`English Docs `_. + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 76d3e0a0092f89005605a23e14e712530112a5ac..e0c69f7a6a4043abe999af6c8dd2555178b68424 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -9,9 +9,7 @@ usage/cmd_parameter/index_cn.rst usage/cluster/cluster_train_cn.md - usage/k8s/k8s_basis_cn.md - usage/k8s/k8s_cn.md - usage/k8s/k8s_distributed_cn.md + usage/capi/index_cn.rst 开发标准 -------- .. toctree:: :maxdepth: 1 - dev/build_cn.rst + dev/contribute_to_paddle_cn.md dev/write_docs_cn.rst 模型配置 diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index 1b6034be4edffd2cbc822018b733b9a3836ea84a..6d1bf7dfc003da6de31410ee0a7959233adfaf76 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -9,8 +9,6 @@ Usage usage/cmd_parameter/index_en.rst usage/cluster/cluster_train_en.md - usage/k8s/k8s_en.md - usage/k8s/k8s_aws_en.md Development ------------ .. toctree:: :maxdepth: 1 - dev/build_en.rst dev/new_layer_en.rst dev/contribute_to_paddle_en.md + dev/write_docs_en.rst Configuration ------------- diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md new file mode 100644 index 0000000000000000000000000000000000000000..1775374cf6e518586c28bbd8e04946c74df7e4c5 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling.md @@ -0,0 +1,197 @@ +This tutorial introduces techniques we use to profile and tune the +CPU performance of PaddlePaddle. We will use Python packages +`cProfile` and `yep`, and Google's `perftools`. + +Profiling is the process that reveals performance bottlenecks, +which could be very different from what's in the developers' minds. +Performance tuning is done to fix these bottlenecks. Performance optimization +repeats the steps of profiling and tuning in alternation. + +PaddlePaddle users program AI applications by calling the Python API, which calls +into `libpaddle.so`, written in C++. In this tutorial, we focus on +the profiling and tuning of + +1. the Python code and +1. 
the mixture of Python and C++ code. + +## Profiling the Python Code + +### Generate the Performance Profiling File + +We can use the Python standard +package [`cProfile`](https://docs.python.org/2/library/profile.html) +to generate the Python profiling file. For example: + +```bash +python -m cProfile -o profile.out main.py +``` + +where `main.py` is the program we are going to profile, and `-o` specifies +the output file. Without `-o`, `cProfile` would output to standard +output. + +### Look into the Profiling File + +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. + +Open a Web browser and point it at the local IP and the specified +port, and we will see output like the following: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +where each line corresponds to a Python function, and the meaning of +each column is as follows: + +| column | meaning | +| --- | --- | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | + +### Identify Performance Bottlenecks + +Usually, `tottime` and the related `percall` time are what we want to +focus on. We can sort the above profiling file by tottime: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At this +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: + +``` +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... 
+
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`. We will
+explain how to profile C++ code in the next section. At this
+moment, let's look into the third function, `sync_with_cpp`, which is a
+Python function. We can click it to understand more about it:
+
+```
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                  was called by...
+                                                                                                              ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <-     4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <-     4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                   1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+The list of the callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
+
+## Profiling Python and C++ Code
+
+### Generate the Profiling File
+
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
+
+On Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+Then we can run the following command
+
+```bash
+python -m yep -v main.py
+```
+
+to generate the profiling file. The default filename is
+`main.py.prof`.
+
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file. By examining the
+printed result, we can tell whether debug information was stripped
+from `libpaddle.so` at build time. The following hints help make sure
+that the analysis results are readable:
+
+1. Use the GCC command line option `-g` when building `libpaddle.so` so as to
+   include the debug information. The standard building system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
+
+1. Use the GCC command line option `-O2` or `-O3` to generate optimized
+   binary code. It doesn't make sense to profile `libpaddle.so`
+   without optimization, because it would run slowly anyway.
+
+1. Profile the single-threaded binary before the multi-threaded
+   version, because the latter often generates a tangled profiling
+   analysis result. You might want to set the environment
+   variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
+   starting multiple threads.
+
+### Examining the Profiling File
+
+The tool we use to examine the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
+
+```bash
+go get github.com/google/pprof
+```
+
+Then we can use it to examine `main.py.prof` generated in the previous
+section:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python` ./main.py.prof
+```
+
+where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
+the following:
+
+![result](./pprof_1.png)
+
+### Identifying the Performance Bottlenecks
+
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
+
+![kernel_perf](./pprof_2.png)
+
+We can see that the execution time of multiplication and the computation
+of the gradient of multiplication take 2% to 4% of the total running
+time, and `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
+
+`pprof` would mark the performance-critical parts of the program in
+red. It's a good idea to follow the hints.
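+
+As a final note, instead of profiling a whole run with `python -m yep`, the
+profiled region can be narrowed. A sketch, assuming the `start`/`stop` pair
+that the `yep` package documents:
+
+```python
+import yep
+
+yep.start("main.py.prof")  # start sampling with perftools
+# ... run only the code to be profiled here ...
+yep.stop()                 # write out the profiling file
+```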
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..14eba0e2f34b115f5cd24920b5b1af07ec953d00
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling_cn.md
@@ -0,0 +1,155 @@
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优 (performance tuning)。
+
+Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。
+
+### 查看性能分析文件
+
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+用Web浏览器访问对应网址,即可显示性能分析的结果:
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名,行号,函数名 |
+
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序,效果如下:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
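+
+上面按`tottime`排序的结果也可以不借助`cprofilev`,直接用标准库`pstats`得到。下面是一个最小示意:
+
+```python
+import pstats
+
+# 读取 `python -m cProfile -o profile.out main.py` 生成的文件,
+# 按 tottime 排序并打印最耗时的 5 个函数。
+stats = pstats.Stats("profile.out")
+stats.sort_stats("tottime").print_stats(5)
+```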
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                  was called by...
+                                                                                                              ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <-     4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <-     4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                   1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`、`valgrind`、`google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析。
+
+使用`yep`前需要安装`google-perftools`与`yep`包。Ubuntu下安装命令为
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后,我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单地看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python` ./main.py.prof
+```
+
+这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址,我们可以查看性能分析的结果。结果如下图所示:
+
+![result](./pprof_1.png)
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中,
+
+![kernel_perf](./pprof_2.png)
+
+在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。
+
+在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序地完成性能的优化。
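+
+补充一点:除了用`python -m yep`分析整个脚本,也可以只分析程序中的一段代码。下面是一个最小示意(假设`yep`包提供其文档中描述的`start`/`stop`接口):
+
+```python
+import yep
+
+yep.start("main.py.prof")  # 开始用 perftools 采样
+# ... 这里只运行需要分析的代码 ...
+yep.stop()                 # 写出性能分析文件
+```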
diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ
diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
new file mode 100644
index 0000000000000000000000000000000000000000..31987920f32f217ac2db42548874cfe7da57dd72
--- /dev/null
+++ b/doc/howto/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers)
+  - Every layer has one or more operators and variables/parameters.
+  - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other files worth looking at:
+    - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+    - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
+    - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following:
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)]
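+
+Since the definition is just a protobuf message, it can be inspected
+directly. A sketch, assuming the `Program` returned by
+`fluid.default_main_program()` can be printed in its protobuf text form
+(the exact accessor may differ across versions):
+
+```python
+import paddle.v2.fluid as fluid
+
+# After the layer calls above, the generated operators and variables
+# live in the default main program; printing it shows the ProgramDesc.
+print(fluid.default_main_program())
+```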
+
+# Run Time
+
+The following **evaluates** the NN. It instantiates all the variables and operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+
+- Place: `place`. One of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+  - The device handles are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+  - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+  - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/usage/capi/compile_paddle_lib_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac5ecffe2ea8ddc3703a32e9a0a8ee83bbe5dd14
--- /dev/null
+++ b/doc/howto/usage/capi/compile_paddle_lib_cn.md
@@ -0,0 +1,122 @@
+## 编译 PaddlePaddle 预测库
+
+### 概述
+
+使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时配置下面这些编译选项:
+
+必须配置选项:
+- `WITH_C_API`,必须配置为`ON`。
+
+推荐配置选项:
+- `WITH_PYTHON`,推荐配置为`OFF`
+- `WITH_SWIG_PY`,推荐配置为`OFF`
+- `WITH_GOLANG`,推荐设置为`OFF`
+
+可选配置选项:
+- `WITH_GPU`,可配置为`ON/OFF`
+- `WITH_MKL`,可配置为`ON/OFF`
+
+建议按照推荐值设置推荐配置中的选项,以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码,并配置编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径):
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF \
+      ..
+```
+
+执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。
+
+编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)):
+
+```text
+├── include
+│   └── paddle
+│       ├── arguments.h
+│       ├── capi.h
+│       ├── capi_private.h
+│       ├── config.h
+│       ├── error.h
+│       ├── gradient_machine.h
+│       ├── main.h
+│       ├── matrix.h
+│       ├── paddle_capi.map
+│       └── vector.h
+├── lib
+│   ├── libpaddle_capi_engine.a
+│   ├── libpaddle_capi_layers.a
+│   ├── libpaddle_capi_shared.so
+│   └── libpaddle_capi_whole.a
+└── third_party
+    ├── gflags
+    │   ├── include
+    │   │   └── gflags
+    │   │       ├── gflags_completions.h
+    │   │       ├── gflags_declare.h
+    │   │       ...
+    │   └── lib
+    │       └── libgflags.a
+    ├── glog
+    │   ├── include
+    │   │   └── glog
+    │   │       ├── config.h
+    │   │       ...
+    │   └── lib
+    │       └── libglog.a
+    ├── openblas
+    │   ├── include
+    │   │   ├── cblas.h
+    │   │   ...
+    │   └── lib
+    │       ...
+    ├── protobuf
+    │   ├── include
+    │   │   └── google
+    │   │       └── protobuf
+    │   │           ...
+    │   └── lib
+    │       └── libprotobuf-lite.a
+    └── zlib
+        ├── include
+        │   ...
+        └── lib
+            ...
+
+```
+
+### 链接说明
+
+目前提供三种链接方式:
+
+1. 链接`libpaddle_capi_shared.so` 动态库
+   - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意:
+     1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。
+     1. 如果编译时指定编译CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。
+     1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。
+   - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。
+
+2. 链接静态库 `libpaddle_capi_whole.a`
+   - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意:
+     1. 需要指定`-Wl,--whole-archive`链接选项。
+     1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。
+     1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显式地链接`libopenblas.a`。
+     1. 如果在编译 C-API 时使用MKL数学库,需要显式地链接MKL的动态库。
+
+3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`
+   - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`时,需注意:
+     1. 这种链接方式主要用于移动端预测。
+     1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。
+     1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。
+     1. 第三方依赖库需要按照与方式2同样的方法显式地进行链接。
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/usage/capi/images/csr.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031
Binary files /dev/null and b/doc/howto/usage/capi/images/csr.png differ
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/usage/capi/images/sequence_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e
Binary files /dev/null and b/doc/howto/usage/capi/images/sequence_data.png differ
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/usage/capi/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb
Binary files /dev/null and b/doc/howto/usage/capi/images/workflow_of_CAPI.png differ
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/usage/capi/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fd774fbc742671c5a8009cb742f2c9d06a525199
--- /dev/null
+++ b/doc/howto/usage/capi/index_cn.rst
@@ -0,0 +1,9 @@
+PaddlePaddle C-API
+==================
+
+.. toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_cn.md
+  organization_of_the_inputs_cn.md
+  workflow_of_capi_cn.md
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..563ec5ca21ec5d75800fa201943d65e6d6fe51ea
--- /dev/null
+++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
@@ -0,0 +1,285 @@
+## 输入/输出数据组织
+
+这篇文档介绍在使用 PaddlePaddle C-API 时如何组织输入数据,以及如何解析神经网络前向计算的输出结果。
+
+### 输入/输出数据类型
+在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为:
+1. 一维整型数组
+1. 二维浮点型矩阵
+    - 稠密矩阵
+    - 稀疏矩阵
+
+说明:
+1. 一维数组**仅支持整型值**;
+    - 常用于自然语言处理任务,例如:表示词语在词典中的序号;
+    - 也用于表示分类任务中的类别标签;
+1. 逻辑上高于二维的数据(例如含有多个通道的图片,视频等)在程序实现中都会转化为二维矩阵,转化方法在相应的领域都有通用解决方案,需要使用者自己了解并完成转化;
+1. 二维矩阵可以表示行向量和列向量,任何时候如果需要浮点型数组(向量),都应使用C-API中的矩阵来表示,而不是C-API中的一维数组。
+1. 不论是一维整型数组还是二维浮点数矩阵,**为它们附加上序列信息将变成序列输入。PaddlePaddle 会通过判断数据是否附带有序列信息来判断一个向量/矩阵是否是一个序列**。当非序列输入时,无需关心和处理序列信息。关于什么是“序列信息”,下文会详细进行介绍。
+
+### 基本使用概念
+
+- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。
+- `Argument` 并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。
+- 在`Argument`内部由`IVector`(对应着上文提到的一维整型数组)和`Matrix`(对应着上文提到的二维浮点型矩阵)来实际存储数据;由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。
+
+- **注**:
+    1. 这篇文档之后部分将会统一使用`argument`来特指PaddlePaddle中神经网络计算层一个输入/输出数据。
+    1. 使用`paddle_ivector`来特指PaddlePaddle中的一维整型数组。
+    1. 使用`paddle_matrix`来特指PaddlePaddle中的二维浮点型矩阵。
+
+### 组织输入数据
+- 一维整型数组
+
+  概念上可以将`paddle_ivector`理解为一个一维的整型数组,通常用于表示离散的类别标签,或是在自然语言处理任务中表示词语在字典中的序号。下面的代码片段创建了含有三个元素`1`、`2`、`3`的`paddle_ivector`。
+  ```c
+  int ids[] = {1, 2, 3};
+  paddle_ivector ids_array =
+      paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false);
+  CHECK(paddle_arguments_set_ids(in_args, 0, ids_array));
+  ```
+
+- **稠密矩阵**
+  - 一个`m×n`的稠密矩阵是一个由`m`行`n`列元素排列成的矩形阵列,矩阵里的元素是浮点数。对神经网络来说,矩阵的高度`m`是一次预测接受的样本数目,宽度`n`是神经网络定义时,`paddle.layer.data`的`size`。
+  - 下面的代码片段创建了一个高度为1,宽度为`layer_size`的稠密矩阵,矩阵中每个元素的值随机生成。
+
+    ```c
+    paddle_matrix mat = paddle_matrix_create(
+        /* height = batch size */ 1,
+        /* width = dimensionality of the data layer */ layer_size,
+        /* whether to use GPU */ false);
+
+    paddle_real* array;
+    // Get the pointer pointing to the start address of the first row of the
+    // created matrix.
+    CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+    // Fill the matrix with a randomly generated test sample.
+    srand(time(0));
+    for (int i = 0; i < layer_size; ++i) {
+      array[i] = rand() / ((float)RAND_MAX);
+    }
+
+    // Assign the matrix to the argument.
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+    ```
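+  - 在 Python 端(训练时),与这段 C 代码对应的数据层声明大致如下。这只是一个示意(`layer_size` 为假设的宽度,接口以实际使用的 `paddle.v2` 版本为准):
+
+    ```python
+    import paddle.v2 as paddle
+
+    layer_size = 128  # 假设的宽度,仅作示意
+    # 稠密输入:与上面宽度为 layer_size 的 paddle_matrix 对应
+    dense_input = paddle.layer.data(
+        name="dense_input", type=paddle.data_type.dense_vector(layer_size))
+    ```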
+
+- **稀疏矩阵**
+
+  PaddlePaddle C-API 中 稀疏矩阵使用[CSR(Compressed Sparse Row Format)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))格式存储。下图是CSR存储稀疏矩阵的示意图(images/csr.png):
+
+  图1. 稀疏矩阵存储示意图
+
+  CSR存储格式通过:(1)非零元素的值(上图中的`values`);(2)行偏移(上图中的`row offsets`):每一行元素在`values`中的起始偏移,`row offsets`中元素个数总是等于行数 + 1;(3)非零元素的列号(上图中的`column indices`)来确定稀疏矩阵的内容。
+
+  在PaddlePaddle C-API中,通过调用以下接口创建稀疏矩阵:
+
+  ```c
+  PD_API paddle_matrix paddle_matrix_create_sparse(
+      uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+  ```
+
+  1. 创建稀疏矩阵时需要显式地指定矩阵的(1)高度(`height`,在神经网络中等于一次预测处理的样本数)(2)宽度(`width`,`paddle.layer.data`的`size`)以及(3)非零元个数(`nnz`)。
+  1. 当上述接口第4个参数`isBinary`指定为`true`时,**只需要设置行偏移(`row_offset`)和列号(`column indices`),不需要提供元素值(`values`)**,这时行偏移和列号指定的元素默认其值为1。
+
+  下面的代码片段创建了一个CPU上的二值稀疏矩阵:
+
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                       rowOffset,
+                                       sizeof(rowOffset) / sizeof(int),
+                                       colIndices,
+                                       sizeof(colIndices) / sizeof(int),
+                                       NULL /*values array is NULL.*/,
+                                       0 /*size of the value array is 0.*/));
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+  ```
+  下面的代码片段创建了一个CPU上的带元素值的稀疏矩阵:
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+  float values[] = {0.5, 0.5, 0.5};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                       rowOffset,
+                                       sizeof(rowOffset) / sizeof(int),
+                                       colIndices,
+                                       sizeof(colIndices) / sizeof(int),
+                                       values,
+                                       sizeof(values) / sizeof(float)));
+  ```
+  注意事项:
+  1. 移动端预测**不支持**稀疏矩阵及相关的接口。
+
+### 组织序列信息
+
+多个排成一列的元素(可以是整型、浮点数、浮点数向量等)构成一个序列,元素之间的顺序是序列所携带的重要信息。不同序列可能会含有不同数目个元素。在 PaddlePaddle 中,序列输入/输出数据是在上文介绍的**数据输入(一维整型数组,二维浮点数矩阵)基础上,附加上序列信息**。下面详细解释什么是“序列信息”。
+
+我们将神经网络一次计算接受的所有输入样本称之为一个`batch`(可以含有一条或多条样本),每一个序列在整个`batch`中的偏移,就是PaddlePaddle中所指的**序列信息**,称之为“sequence start positions”。PaddlePaddle 支持两种序列类型:
+
+1. 单层序列
+    - 序列中的每一个元素是非序列,是进行计算的基本单位,不可再进行拆分。
+    - 例如:自然语言中的句子是一个序列,序列中的元素是词语;
+1. 双层序列
+    - 序列中的每一个元素又是一个序列。
+    - 例如:自然语言中的段落是一个双层序列;段落是由句子构成的序列;句子是由词语构成的序列。
+    - 双层序列在处理长序列的任务或是构建层级模型时会发挥作用。
+
+这篇文档之后部分会统一使用`sequence_start_positions`来特指:PaddlePaddle中神经网络计算层输入/输出所携带的序列信息。
+
+对双层序列来讲,不仅要提供每一个外层序列在整个`batch`中的偏移,每一个外层序列又含有若干个内层序列,需要同时提供每一个内层序列在整个`batch`中的偏移。也就是说:**双层序列需要分别为外层序列和内层序列设置`sequence_start_positions`信息**。
+
+**注:**
+1. 不论序列中的元素在内存中占用多少实际存储空间,`sequence_start_positions`表示的偏移是以“序列中的一个元素”作为统计的基本单位,而不是相对`batch`起始存储地址以数据的存储大小为单位的偏移。
+1. 非序列输入不携带`sequence_start_positions`,非序列输入无需构造`sequence_start_positions`。
+1. **不论是单层序列还是双层序列的序列信息,都使用`paddle_ivector`(也就是PaddlePaddle中的一维整型数组)来存储。**
+
+图2 是PaddlePaddle中单层序列和双层序列存储示意图(images/sequence_data.png):
+
+图2. 序列输入示意图
+
+- 单层序列
+
+  图2 (a) 展示了一个含有4个序列的`batch`输入:
+  1. 4个序列的长度分别为:5、3、2、4;
+  1. 这时的`sequence_start_positions`为:`[0, 5, 8, 10, 14]`;
+  1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,都可以通过调用下面的接口为原有的数据输入附加上序列信息,使之变为一个单层序列输入,代码片段如下:
+
+  ```c
+  int seq_pos_array[] = {0, 5, 8, 10, 14};
+  paddle_ivector seq_pos = paddle_ivector_create(
+      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+  // Suppose the network only has one input data layer.
+  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+  ```
+
+- 双层序列
+
+  图2 (b) 展示了一个含有4个序列的`batch`输入;
+  1. 4个序列的长度分别为:5、3、2、4;这四个序列又分别含有3、2、1、2个子序列;
+  1. 这时需要同时提供:
+      - 外层序列在`batch`中的起始偏移:`[0, 5, 8, 10, 14]`;
+      - 内层序列在`batch`中的起始偏移:`[0, 2, 3, 5, 7, 8, 10, 13, 14]`;
+  1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,这时需要调用创建序列信息和为`argument`设置序列信息的接口**两次**,分别为数据输入添加外层序列和内层序列的序列信息,使之变为一个双层序列输入,代码片段如下:
+  ```c
+  // Set the sequence start positions for the outer sequences.
+  int outer_seq_pos_array[] = {0, 5, 8, 10, 14};
+  paddle_ivector outer_seq_pos =
+      paddle_ivector_create(outer_seq_pos_array,
+                            sizeof(outer_seq_pos_array) / sizeof(int),
+                            false,
+                            false);
+  // The third parameter of this API indicates the sequence level.
+  // 0 for the outer sequence. 1 for the inner sequence.
+  // If the input is a sequence, not the nested sequence, the third parameter is
+  // fixed to be 0.
+  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, outer_seq_pos));
+
+  // Set the sequence start positions for the inner sequences.
+  int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14};
+  paddle_ivector inner_seq_pos = paddle_ivector_create(
+      inner_seq_pos_array, sizeof(inner_seq_pos_array) / sizeof(int), false, false);
+  // The third parameter of this API indicates the sequence level.
+  // 0 for the outer sequence. 1 for the inner sequence.
+  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, inner_seq_pos));
+  ```
+
+注意事项:
+1. 当一个`batch`中含有多个序列,**不支持序列长度为`0`的序列(也就是空输入)** 作为输入。不同计算层对空输入的处理策略有可能不同,潜在会引起未定义行为,或者引起运行时错误,请在输入时进行合法性检查。序列偏移的计算方式可参考下面的示意代码。
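+
+下面用纯 Python 演示如何从各序列的长度计算出上文中的两组偏移(只是示意,不依赖任何 PaddlePaddle 接口):
+
+```python
+def start_positions(lengths):
+    # 由各序列长度累加得到 sequence_start_positions
+    pos = [0]
+    for n in lengths:
+        pos.append(pos[-1] + n)
+    return pos
+
+# 图2 (a):4 个序列的长度分别为 5、3、2、4
+print(start_positions([5, 3, 2, 4]))              # [0, 5, 8, 10, 14]
+
+# 图2 (b):各内层序列(子序列)的长度
+print(start_positions([2, 1, 2, 2, 1, 2, 3, 1]))  # [0, 2, 3, 5, 7, 8, 10, 13, 14]
+```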
+
+### Python 端数据类型说明
+
+下表列出了Python端训练接口暴露的数据类型(`paddle.layer.data`函数`type`字段的取值)对应于调用C-API需要创建的数据类型:
+
+| Python 端数据类型 | C-API 输入数据类型 |
+| --- | --- |
+| paddle.data_type.integer_value | 整型数组,无需附加序列信息 |
+| paddle.data_type.dense_vector | 浮点型稠密矩阵,无需附加序列信息 |
+| paddle.data_type.sparse_binary_vector | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,无需附加序列信息 |
+| paddle.data_type.sparse_vector | 浮点型稀疏矩阵,需提供非零元的值,无需附加序列信息 |
+| paddle.data_type.integer_value_sequence | 整型数组,需附加序列信息 |
+| paddle.data_type.dense_vector_sequence | 浮点型稠密矩阵,需附加序列信息 |
+| paddle.data_type.sparse_binary_vector_sequence | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加序列信息 |
+| paddle.data_type.sparse_vector_sequence | 浮点型稀疏矩阵,需提供非零元的值,需附加序列信息 |
+| paddle.data_type.integer_value_sub_sequence | 整型数组,需附加双层序列信息 |
+| paddle.data_type.dense_vector_sub_sequence | 浮点型稠密矩阵,需附加双层序列信息 |
+| paddle.data_type.sparse_binary_vector_sub_sequence | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加双层序列信息 |
+| paddle.data_type.sparse_vector_sub_sequence | 浮点型稀疏矩阵,需提供非零元的值,需附加双层序列信息 |
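+
+结合上表,下面的代码片段示意了几种常用类型在 Python 端的声明方式(词典大小、向量宽度均为假设值,接口以实际使用的 `paddle.v2` 版本为准):
+
+```python
+import paddle.v2 as paddle
+
+dict_size = 1024   # 假设的词典大小
+layer_size = 128   # 假设的向量宽度
+
+# 整型数组 + 单层序列信息,常用于词 id 序列
+words = paddle.layer.data(
+    name="words",
+    type=paddle.data_type.integer_value_sequence(dict_size))
+
+# 浮点型稀疏矩阵(二值),无序列信息
+sparse_input = paddle.layer.data(
+    name="sparse_input",
+    type=paddle.data_type.sparse_binary_vector(layer_size))
+```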
+
+
+### 输出数据
+
+PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织方式完全相同。一个输出数据同样被组织为一个`argument`,`argument`通过`paddle_matrix`或`paddle_ivector`存储数据,如果输出是一个序列,那么会携带有`sequence_start_positions`信息。调用C-API相关接口,读取需要的结果即可。
+
+### 总结
+
+- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为`argument`。
+- `argument`并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。
+- 在`argument`内部由`paddle_ivector`(一维整型数组)和`paddle_matrix`(二维浮点型矩阵)来实际存储数据。
+如果是一个序列输入/输出,由 `sequence start positions` 来记录输入/输出的序列信息。
+
+于是,在组织神经网络输入时,需要思考完成以下工作:
+1. 为每一个输入/输出创建`argument`。
+    - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。
+1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。
+    - C-API 中操作`paddle_ivector`的接口请查看 [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)。
+    - C-API 中操作`paddle_matrix`的接口请查看[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)。
+1. 如果输入是序列数据,需要创建并填写`sequence_start_positions`信息。
+    - 通过调用 [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) 来为一个`argument`添加序列信息。
+    - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`的序列信息。
+    - 接口说明请查看 [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/usage/capi/workflow_of_capi_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0a42fff12cf0f53dee18165e059150861524f74
--- /dev/null
+++ b/doc/howto/usage/capi/workflow_of_capi_cn.md
@@ -0,0 +1,119 @@
+## C-API 使用流程
+
+这篇文档介绍 PaddlePaddle C-API 整体使用流程。
+
+### 使用流程
+
+使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。
+
+图1. C-API使用流程示意图(images/workflow_of_CAPI.png)
+
+- 准备预测模型
+    1. 只将神经网络结构进行序列化。
+        - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。
+    1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。
+        - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。
+        - 预测时只需加载一个文件便于发布。
+    - **注意**:以上两种方式只需选择其一即可。
+- 调用 C-API 开发预测程序
+    1. 初始化PaddlePaddle运行环境。
+    1. 加载预测模型。
+    1. 创建神经网络输入,组织输入数据。
+    1. 进行前向计算,获得计算结果。
+    1. 清理和结束。
+
+### 准备预测模型
+
+准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。
+
+调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。
+
+下面,我们将训练结束后存储下来的模型转换成预测模型。
+
+1. 序列化神经网络模型配置
+
+   PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。
+
+   调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下:
+
+   ```python
+   from paddle.utils.dump_v2_config import dump_v2_config
+   from mnist_v2 import network
+
+   predict = network(is_infer=True)
+   dump_v2_config(predict, "trainer_config.bin", True)
+   ```
+
+   对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。
+
+   使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。
+
+2. 合并模型文件(可选)
+
+   一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。
+
+   代码示例如下:
+
+   ```python
+   from paddle.utils.merge_model import merge_v2_model
+   from mnist_v2 import network
+
+   net = network(is_infer=True)
+   param_file = "models/params_pass_4.tar"
+   output_file = "output.paddle.model"
+   merge_v2_model(net, param_file, output_file)
+   ```
+   对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。
+
+#### 注意事项
+1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。
+1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。
+1. 预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。
+
+### 编写预测代码
+
+更多详细的预测示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。
+
+#### step 1. 初始化PaddlePaddle运行环境
+第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。
+
+#### step 2. 加载模型
+
+这里介绍C-API使用中的一个重要概念:Gradient Machine。
+
+概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的两种常用的模型加载方式:
+
+1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型;
+1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
+
+- 注意事项
+    1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。
+    1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
+    1. 通过灵活使用以上两个接口,还可以用其它多种方式加载模型,例如可以在程序运行过程中再加载另外一个模型。
+
+#### step 3. 创建神经网络输入,组织输入数据
+
+基本使用概念:
+- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。
+- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。
+- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. `IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。
+
+C-API支持的所有输入数据类型和它们的组织方式,请参考“输入/输出数据组织”一节。
+
+这篇文档之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。
+
+在组织神经网络输入,获取输出时,需要思考完成以下工作:
+1. 为每一个输入/输出创建`argument`;
+1. 为每一个`argument`创建`paddle_matrix`来存储数据;
+
+与输入不同的是,不需要在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。
+
+#### step 4. 前向计算
+
+完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
+
+#### step 5. 清理
+
+结束预测之后,对使用的中间变量和资源进行清理和释放。
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 2e98b3de3fe2284375f87e883ff4bac19255dbeb..c2fc86687d7106aac7c74d6dd16bc229353cb7c1 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -1,25 +1,8 @@
-# PaddlePaddle分布式训练
-
-* [概述](#概述)
-* [环境准备](#环境准备)
-* [启动参数说明](#启动参数说明)
-   * [启动参数服务器](#启动参数服务器)
-   * [启动计算节点](#启动计算节点)
-   * [准备数据集](#准备数据集)
-   * [准备训练程序](#准备训练程序)
-* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
-   * [使用Fabric启动集群作业](#使用fabric启动集群作业)
-      * [准备一个Linux集群](#准备一个linux集群)
-      * [启动集群作业](#启动集群作业)
-      * [终止集群作业](#终止集群作业)
-      * [检查集群训练结果](#检查集群训练结果)
-      * [检查模型输出](#检查模型输出)
-   * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
-      * [准备OpenMPI集群](#准备OpenMPI集群)
-      * [启动集群作业](#启动集群作业-1)
-   * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
+# 分布式训练
+
 ## 概述
+
 本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示:
@@ -32,10 +15,11 @@
 在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。
+
 ## 环境准备
 
 1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。
-1.
我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。 +1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。 安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`): ```bash @@ -63,12 +47,12 @@ $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradie $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log ``` -| 参数 | 是否必选 | 默认值 | 说明 | -| ------------- | ------------- | ------------- | ------------- | -| port | 必选 | 7164 | pserver监听的起始端口,根据ports_num决定
总端口个数,从起始端口监听多个端口用于通信 | -| ports_num | 必选 | 1 | 监听的端口个数 | -| ports_num_for_sparse | 必选 | 1 | 用于稀疏类型参数通信的端口个数 | -| num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 | +参数说明 + +- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 +- ports_num:**必选,默认1**,监听的端口个数 +- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 ### 启动计算节点 执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) @@ -76,7 +60,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num $ python train.py ``` -trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量(https://zh.wikipedia.org/wiki/环境变量 )或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 +trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 使用环境变量: @@ -105,16 +89,16 @@ paddle.init( pservers="127.0.0.1") ``` -| 参数 | 是否必选 | 默认 | 说明 | -| ------------- | ------------- | ------------- | ------------- | -| use_gpu | 可选 | False | 是否启用GPU训练 | -| trainer_count | 必选 | 1 | 当前训练任务trainer总个数 | -| port | 必选 | 7164 | 连接到pserver的端口 | -| ports_num | 必选 | 1 | 连接到pserver的端口个数 | -| ports_num_for_sparse | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数 | -| num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 | -| trainer_id | 必选 | 0 | 每个trainer的唯一ID,从0开始的整数 | -| pservers | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 | +参数说明 + +- use_gpu: **可选,默认False**,是否启用GPU训练 +- trainer_count:**必选,默认1**,当前训练任务trainer总个数 +- port:**必选,默认7164**,连接到pserver的端口 +- ports_num:**必选,默认1**,连接到pserver的端口个数 +- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 +- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 +- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 ### 准备数据集 @@ -171,7 +155,7 @@ test.txt-00002 - `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 - `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 -- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: +- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: ```python cluster_train_file = "./train_data_dir/train/train.txt" @@ -195,91 +179,10 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务 在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 -### 使用Fabric启动集群作业 - -#### 准备一个Linux集群 -可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 - -#### 启动集群作业 - -`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 - -`paddle.py` 为方便作业启动提供了两个独特的命令选项。 - -- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 -- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 - -`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: -``` -sh run.sh -``` - -集群作业将会在几秒后启动。 - -#### 终止集群作业 -`paddle.py`能获取`Ctrl + C` 
SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 - -#### 检查集群训练结果 -详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 - -`paddle_trainer.INFO` -提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。 - -`paddle_pserver2.INFO` -提供 pserver 运行日志,有助于诊断分布式错误。 - -`server.log` -提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 - -`train.log` -提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 - -#### 检查模型输出 -运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 -工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 - -### 在OpenMPI集群中提交训练作业 - -#### 准备OpenMPI集群 - -执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点: - -```bash -paddle/scripts/cluster_train_v2/openmpi/docker_cluster -kubectl create -f head.yaml -kubectl create -f mpi-nodes.yaml -``` - -然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。 - -#### 启动集群作业 - -您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务: - -```bash -# 获得head和node节点的IP地址 -kubectl get po -o wide -# 将node节点的IP地址保存到machines文件中 -kubectl get po -o wide | grep nodes | awk '{print $6}' > machines -# 拷贝必要的文件到head节点 -scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ -# ssh 登录到head节点 -ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] -# --------------- 以下操作均在head节点中执行 --------------- -# 准备训练数据 -python prepare.py -# 拷贝训练程序和字典文件到每台MPI节点 -cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial -# 创建日志目录 -mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs -# 拷贝训练数据到各自的节点 -scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial -scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial -scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial -# 启动训练任务 -mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh -``` - -### 在Kubernetes集群中提交训练作业 +## 在不同集群中运行 -此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。 + - [fabric集群](fabric_cn.md) + - [openmpi集群](openmpi_cn.md) + - [kubernetes单机](k8s_cn.md) + - [kubernetes distributed分布式](k8s_distributed_cn.md) + - [AWS上运行kubernetes集群训练](k8s_aws_cn.md) diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index baa97c0c02ae490fff8587071bd2d4adfb5325e3..28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97 100644 --- a/doc/howto/usage/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -1,23 +1,4 @@ -# PaddlePaddle Distributed Training - -* [Introduction](#introduction) -* [Preparations](#preparations) -* [Command-line arguments](#command-line-arguments) - * [Starting parameter server](#starting-parameter-server) - * [Starting trainer](#starting-trainer) - * [Prepare Training Dataset](#prepare-training-dataset) - * [Prepare Training program](#prepare-training-program) -* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools) - * [Cluster Training Using Fabric](#cluster-training-using-fabric) - * [Prepare a Linux cluster](#prepare-a-linux-cluster) - * [Launching Cluster Job](#launching-cluster-job) - * [Kill Cluster Job](#kill-cluster-job) - * [Check Cluster Training Result](#check-cluster-training-result) - * [Check Model Output](#check-model-output) - * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi) - * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster) - * [Launching Cluster Job](#launching-cluster-job-1) - * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes) +# Distributed Training ## Introduction @@ -35,7 +16,7 @@ When training with synchronize SGD, PaddlePaddle uses an internal "synchronize b ## Preparations 1. Prepare your computer cluster. 
It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". -2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). +2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html). After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): @@ -67,12 +48,12 @@ If you wish to run parameter servers in background, and save a log file, you can $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log ``` -| param | required | default | description | -| ------------- | ------------- | ------------- | ------------- | -| port | required | 7164 | port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput | -| ports_num | required | 1 | total number of ports will listen on | -| ports_num_for_sparse | required | 1 | number of ports which serves sparse parameter update | -| num_gradient_servers | required | 1 | total number of gradient servers | +Parameter Description + +- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput. +- ports_num: **required, default 1**, total number of ports will listen on. +- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update. +- num_gradient_servers: **required, default 1**, total number of gradient servers. ### Starting trainer Type the command below to start the trainer(name the file whatever you want, like "train.py") @@ -111,16 +92,16 @@ paddle.init( pservers="127.0.0.1") ``` -| param | required | default | description | -| ------------- | ------------- | ------------- | ------------- | -| use_gpu | optional | False | set to "True" to enable GPU training | -| trainer_count | required | 1 | total count of trainers in the training job | -| port | required | 7164 | port to connect to parameter server | -| ports_num | required | 1 | number of ports for communication | -| ports_num_for_sparse | required | 1 | number of ports for sparse type caculation | -| num_gradient_servers | required | 1 | total number of gradient server | -| trainer_id | required | 0 | ID for every trainer, start from 0 | -| pservers | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," | +Parameter Description + +- use_gpu: **optional, default False**, set to "True" to enable GPU training. 
+- trainer_count: **required, default 1**, total count of trainers in the training job. +- port: **required, default 7164**, port to connect to parameter server. +- ports_num: **required, default 1**, number of ports for communication. +- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. +- num_gradient_servers: **required, default 1**, total number of gradient server. +- trainer_id: **required, default 0**, ID for every trainer, start from 0. +- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". ### Prepare Training Dataset @@ -178,7 +159,7 @@ Your workspace may looks like: - `my_lib.py`: user defined libraries, like PIL libs. This is optional. - `word_dict.pickle`: dict file for training word embeding. -- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables: +- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables: ```python cluster_train_file = "./train_data_dir/train/train.txt" @@ -202,92 +183,9 @@ We'll introduce cluster job management on these platforms. The examples can be f These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc. -### Cluster Training Using Fabric - -#### Prepare a Linux cluster - -Run `kubectl -f ssh_servers.yaml` under the directory: `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. - -#### Launching Cluster Job -`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. - -`paddle.py`provides two distinguished command option for easy job launching. - -- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying. -- `job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy -dispatch latency. - -`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: -``` -sh run.sh -``` - -The cluster Job will start in several seconds. - -#### Kill Cluster Job -`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed. 
- -#### Check Cluster Training Result -Check log in $workspace/log for details, each node owns same log structure. - -`paddle_trainer.INFO` -It provides almost all internal output log for training, same as local training. Check runtime model convergence here. - -`paddle_pserver2.INFO` -It provides parameter server running log, which could help to diagnose distributed error. - -`server.log` -It provides stderr and stdout of parameter server process. Check error log if training crashes. - -`train.log` -It provides stderr and stdout of trainer process. Check error log if training crashes. - -#### Check Model Output -After one pass finished, model files will be written in `output` directory in node 0. -`nodefile` in workspace indicates the node id of current cluster job. - -### Cluster Training Using OpenMPI - -#### Prepare an OpenMPI cluster - -Run the following command to start a 3-node MPI cluster and one "head" node. - -```bash -cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster -kubectl create -f head.yaml -kubectl create -f mpi-nodes.yaml -``` - -Then you can log in to every OpenMPI node using ssh without input any passwords. - -#### Launching Cluster Job - -Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\ - -```bash -# find out node IP addresses -kubectl get po -o wide -# generate a "machines" file containing node IP addresses -kubectl get po -o wide | grep nodes | awk '{print $6}' > machines -# copy necessary files onto "head" node -scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ -# login to head node using ssh -ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] -# --------------- in head node --------------- -# prepare training data -python prepare.py -# copy training data and dict file to MPI nodes -cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial -# creat a directory for storing log files -mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs -# copy training data to every node -scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial -scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial -scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial -# start the job -mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh -``` - -### Cluster Training Using Kubernetes +## Use different clusters -The details can be found [here](../k8s/k8s_cn.md) + - [fabric](fabric_en.md) + - [openmpi](openmpi_en.md) + - [kubernetes](k8s_en.md) + - [kubernetes on AWS](k8s_aws_en.md) diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/usage/cluster/fabric_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..0385e401b399a51fad112e604dc56cb2f84c0a4b --- /dev/null +++ b/doc/howto/usage/cluster/fabric_cn.md @@ -0,0 +1,42 @@ +# 使用fabric启动集群训练 + +## 准备一个Linux集群 +可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 + +## 启动集群作业 + +`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 + +`paddle.py` 为方便作业启动提供了两个独特的命令选项。 + +- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 +- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 + +`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: +``` +sh run.sh 
+``` + +集群作业将会在几秒后启动。 + +## 终止集群作业 +`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 + +## 检查集群训练结果 +详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 + +`paddle_trainer.INFO` +提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。 + +`paddle_pserver2.INFO` +提供 pserver 运行日志,有助于诊断分布式错误。 + +`server.log` +提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 + +`train.log` +提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 + +## 检查模型输出 +运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 +工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/usage/cluster/fabric_en.md new file mode 100644 index 0000000000000000000000000000000000000000..bf270d89ab8514801ca4629cf412f73257429df9 --- /dev/null +++ b/doc/howto/usage/cluster/fabric_en.md @@ -0,0 +1,43 @@ +# Cluster Training Using Fabric + +## Prepare a Linux cluster + +Run `kubectl -f ssh_servers.yaml` under the directory: `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. + +## Launching Cluster Job +`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. + +`paddle.py`provides two distinguished command option for easy job launching. + +- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying. +- `job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy +dispatch latency. + +`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: +``` +sh run.sh +``` + +The cluster Job will start in several seconds. + +## Kill Cluster Job +`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed. + +## Check Cluster Training Result +Check log in $workspace/log for details, each node owns same log structure. + +`paddle_trainer.INFO` +It provides almost all internal output log for training, same as local training. Check runtime model convergence here. + +`paddle_pserver2.INFO` +It provides parameter server running log, which could help to diagnose distributed error. + +`server.log` +It provides stderr and stdout of parameter server process. Check error log if training crashes. + +`train.log` +It provides stderr and stdout of trainer process. Check error log if training crashes. + +## Check Model Output +After one pass finished, model files will be written in `output` directory in node 0. +`nodefile` in workspace indicates the node id of current cluster job. 
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/usage/cluster/k8s_aws_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..c44cd9a731bed7067cdf19aa2f714abdce6c736a
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
@@ -0,0 +1 @@
+k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/cluster/k8s_aws_en.md
similarity index 98%
rename from doc/howto/usage/k8s/k8s_aws_en.md
rename to doc/howto/usage/cluster/k8s_aws_en.md
index ce72b0803818d5bf0c18753c421848cf2fc1b668..0dfa8237a3fa2c9c3ee11e873c9fbbed3cd6018f 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/cluster/k8s_aws_en.md
@@ -493,7 +493,7 @@ spec:
     spec:
       containers:
       - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/efs"
@@ -522,7 +522,7 @@ NAME DESIRED SUCCESSFUL AGE
 paddle-data 1 1 6m
 ```
 
-Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+Data preparation is done by the docker image `paddlepaddle/paddle-tutorial:k8s_data`; see [here](src/k8s_data/README.md) for the source code and how to build this docker image.
 
 #### Start Training
 
@@ -545,7 +545,7 @@ spec:
          claimName: efsvol
      containers:
      - name: trainer
-        image: paddledev/paddle-tutorial:k8s_train
+        image: paddlepaddle/paddle-tutorial:k8s_train
        command: ["bin/bash", "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
@@ -617,7 +617,7 @@ kubectl --kubeconfig=kubeconfig log -f POD_NAME
 
 Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
 
-The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+The details of starting `pserver` and `trainer` are hidden inside the docker image `paddlepaddle/paddle-tutorial:k8s_train`; see [here](src/k8s_train/README.md) for the source code and how to build the docker image.
#### Inspect Training Output diff --git a/doc/howto/usage/k8s/k8s_cn.md b/doc/howto/usage/cluster/k8s_cn.md similarity index 83% rename from doc/howto/usage/k8s/k8s_cn.md rename to doc/howto/usage/cluster/k8s_cn.md index ab07cb9cd5b135ddea82b3360720537f1dc5a801..c1a11f7165a2f9da9dd044641274447e7943a597 100644 --- a/doc/howto/usage/k8s/k8s_cn.md +++ b/doc/howto/usage/cluster/k8s_cn.md @@ -1,21 +1,22 @@ # Kubernetes单机训练 -在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。 +在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。 ## 制作Docker镜像 -在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在 -Paddle的Docker image里。为此,我们需要制作一个包含训练数据的Paddle镜像。 +在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式PaddlePaddle训练任务中 +的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在 +PaddlePaddle的Docker Image里。为此,我们需要制作一个包含训练数据的PaddlePaddle镜像。 + +PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo, +(请注意,默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的,PaddlePaddle的各版本镜像可以参考 +[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)), +下面我们使用这个镜像来下载数据到Docker Container中,并把这个包含了训练数据的Container保存为一个新的镜像。 -Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) -里介绍了用Paddle源码中的脚本下载训练数据的过程。 -而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo,( 请注意,默认的 -Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ),所以我们使用这个镜像来下载训练数据到Docker container中,然后把这个包含了训练数据的container保存为一个新的镜像。 - ### 运行容器 ``` -$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest +$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest ``` ### 下载数据 @@ -103,7 +104,7 @@ spec: restartPolicy: Never ``` -### 创建Paddle Job +### 创建PaddlePaddle Job 使用上文创建的yaml文件创建Kubernetes Job,命令为: diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s_distributed_cn.md similarity index 88% rename from doc/howto/usage/k8s/k8s_distributed_cn.md rename to doc/howto/usage/cluster/k8s_distributed_cn.md index a9bebf09558b06993119803458977abedbbfbdd0..167089b8074b33e3b094fa3ec8e377630cec42ac 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/cluster/k8s_distributed_cn.md @@ -1,8 +1,6 @@ # Kubernetes分布式训练 -前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 - -有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群,可以参考[k8s_basis](./k8s_basis_cn.md)。 +前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 ## 整体方案 @@ -28,7 +26,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行 - 拷贝训练文件到容器内 - 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练 -因为官方镜像 `paddledev/paddle:cpu-latest` 
内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没有上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
 
 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -62,7 +60,7 @@ spec:
   hostNetwork: true
   containers:
   - name: paddle-data
-    image: paddledev/paddle-tutorial:k8s_data
+    image: paddlepaddle/paddle-tutorial:k8s_data
    imagePullPolicy: Always
    volumeMounts:
    - mountPath: "/mnt"
@@ -149,20 +147,19 @@ spec:
 
 文件中,`metadata`下的`name`表示这个job的名字。`parallelism,completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后申明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。
 
-`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内:
+
 
-环境变量 | 说明
---- | ---
-JOB_PATH | 共享存储挂在的路径
-JOB_NAME | Job的名字
-TRAIN_CONFIG_DIR | 本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数,即网卡名
-CONF_PADDLE_PORT | `paddle paserver`的`--port`参数
-CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量,即`--ports_num`参数
-CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量,即`--ports_num_for_sparse`参数
-CONF_PADDLE_GRADIENT_NUM | 训练节点数量,即`--num_gradient_servers参数`
+- JOB_PATH:共享存储挂载的路径
+- JOB_NAME:Job的名字
+- TRAIN_CONFIG_DIR:本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC:`paddle pserver`进程需要的`--nics`参数,即网卡名
+- CONF_PADDLE_PORT:`paddle pserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM:稠密更新的端口数量,即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE:稀疏更新的端口数量,即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM:训练节点数量,即`--num_gradient_servers`参数
 
-这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
 
 编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/cluster/k8s_en.md
similarity index 79%
rename from doc/howto/usage/k8s/k8s_en.md
rename to doc/howto/usage/cluster/k8s_en.md
index 0c3ab05b708e7a924577c26496b8c55126e76c62..c374f00a495d705ceddf8d3d930768ceeb93282b 100644
--- a/doc/howto/usage/k8s/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s_en.md
@@ -1,18 +1,27 @@
-# Paddle On Kubernetes
+# PaddlePaddle On Kubernetes
 
->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+In this article, we will introduce how to run a PaddlePaddle training job on a single-CPU machine using Kubernetes. In the next article, we will introduce how to run a PaddlePaddle training job on a distributed cluster.
 
 ## Build Docker Image
 
-In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+In a distributed Kubernetes cluster, we will use Ceph or another distributed
+storage system for storing training related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo a training job on a single machine. In order to simplify the requirement
+of the environment, we will directly put the training data into the PaddlePaddle Docker image,
+so we need to create a PaddlePaddle Docker image that includes the training data.
+
+The Docker image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demo. (Caution: the default production PaddlePaddle Docker image `paddlepaddle/paddle:latest` doesn't include
+the source code; PaddlePaddle's different Docker image versions can be found here:
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)),
+so we run this Docker image and download the training data, and then commit the whole
+container to be a new Docker image.
 
-Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
-And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
-
 ### Run Docker Container
 
 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```
 
 ### Download Training Data
 
@@ -67,7 +76,7 @@ $ docker commit quick_start_data mypaddle/paddle:quickstart
 
 ## Use Kubernetes For Training
 
->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+We will use a Kubernetes job for the training process; the following steps show how to do the training with Kubernetes.
 
 ### Create Yaml Files
 
@@ -99,7 +108,7 @@ spec:
       restartPolicy: Never
 ```
 
-### Start Paddle Job
+### Start PaddlePaddle Job
 
 Using the above yaml file to start the Kubernetes job.
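To make the launch and monitoring steps concrete, here is a short sketch using stock `kubectl` commands; the file name `job.yaml` and the job name `quickstart` are placeholders for whatever file name and `metadata.name` your yaml actually uses:

```bash
# Create the training job from the yaml file written above.
kubectl create -f job.yaml
# Kubernetes labels a job's pods with job-name=<metadata.name>;
# watch the pod until it reaches the Completed state.
kubectl get pods -l job-name=quickstart -w
# Print the training log of the finished pod.
kubectl logs "$(kubectl get pods -l job-name=quickstart -o name | head -n 1)"
```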
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/usage/cluster/openmpi_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..831cafdc03c6a908f31769d0467de022df42dab5
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_cn.md
@@ -0,0 +1,41 @@
+# 在OpenMPI集群中提交训练作业
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点:
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务:
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/usage/cluster/openmpi_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..09af46e25ebe1f843dc7c7be0997dc706413b65c
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_en.md
@@ -0,0 +1,41 @@
+# Cluster Training Using OpenMPI
+
+## Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without entering any password.
+
+## Launching Cluster Job
+
+Follow these steps to launch a PaddlePaddle training job on the OpenMPI cluster:
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# create a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/k8s/src/Dockerfile b/doc/howto/usage/cluster/src/Dockerfile
similarity index 54%
rename from doc/howto/usage/k8s/src/Dockerfile
rename to doc/howto/usage/cluster/src/Dockerfile
index 3a73606c61432329b4cc2d2f8daadc5af8735c96..e178bf4da0f32fca9586b5b69a2c7419de5d9cb1 100644
--- a/doc/howto/usage/k8s/src/Dockerfile
+++ b/doc/howto/usage/cluster/src/Dockerfile
@@ -1,4 +1,4 @@
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest
 
 MAINTAINER zjsxzong89@gmail.com
 
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/add_security_group.png
rename to doc/howto/usage/cluster/src/add_security_group.png
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/k8s/src/create_efs.png
rename to doc/howto/usage/cluster/src/create_efs.png
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/k8s/src/efs_mount.png
rename to doc/howto/usage/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde
Binary files /dev/null and b/doc/howto/usage/cluster/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/README.md
rename to doc/howto/usage/cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/get_data.sh
rename to doc/howto/usage/cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
similarity index 77%
rename from
doc/howto/usage/k8s/src/k8s_train/Dockerfile rename to doc/howto/usage/cluster/src/k8s_train/Dockerfile index c0fca1f9a945921e6e8899fee2db8845e66136a1..77f021a89a70d934bf70424eaa3c6dc3f7c93a28 100644 --- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile +++ b/doc/howto/usage/cluster/src/k8s_train/Dockerfile @@ -1,4 +1,4 @@ -FROM paddledev/paddle:cpu-latest +FROM paddlepaddle/paddle:latest COPY start.sh /root/ COPY start_paddle.py /root/ diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/cluster/src/k8s_train/README.md similarity index 100% rename from doc/howto/usage/k8s/src/k8s_train/README.md rename to doc/howto/usage/cluster/src/k8s_train/README.md diff --git a/doc/howto/usage/k8s/src/k8s_train/start.sh b/doc/howto/usage/cluster/src/k8s_train/start.sh similarity index 100% rename from doc/howto/usage/k8s/src/k8s_train/start.sh rename to doc/howto/usage/cluster/src/k8s_train/start.sh diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py similarity index 90% rename from doc/howto/usage/k8s/src/k8s_train/start_paddle.py rename to doc/howto/usage/cluster/src/k8s_train/start_paddle.py index 935c12bb67e1fe08bc135a7a2220fcd43c548482..1774f8b640c5a2dee036db36b40123b6de7bf68c 100755 --- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py +++ b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/python # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/cluster/src/managed_policy.png similarity index 100% rename from doc/howto/usage/k8s/src/managed_policy.png rename to doc/howto/usage/cluster/src/managed_policy.png diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/cluster/src/pserver_and_trainer.png similarity index 100% rename from doc/howto/usage/k8s/src/pserver_and_trainer.png rename to doc/howto/usage/cluster/src/pserver_and_trainer.png diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/cluster/src/route53_create_recordset.png similarity index 100% rename from doc/howto/usage/k8s/src/route53_create_recordset.png rename to doc/howto/usage/cluster/src/route53_create_recordset.png diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/cluster/src/route53_create_zone.png similarity index 100% rename from doc/howto/usage/k8s/src/route53_create_zone.png rename to doc/howto/usage/cluster/src/route53_create_zone.png diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py index c0940f0e56eafa22f8aeb7052c0ddc79d8862917..d449e02023f1ec48669ce734a24093aed031adc5 100644 --- a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py +++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import gzip import math diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py index 2e6d8887124a5524505b097803a60a35478ca644..a5dd347f0b594e6037182bfad39a6a736fdbab66 100644 --- a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py +++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import math import os import paddle.v2 as paddle diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/cluster/src/worker_security_group.png similarity index 100% rename from doc/howto/usage/k8s/src/worker_security_group.png rename to doc/howto/usage/cluster/src/worker_security_group.png diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md deleted file mode 100644 index 4c3dc81ed38f239c1f4a83d22b49cf57b5d16a8b..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/k8s_basis_cn.md +++ /dev/null @@ -1,75 +0,0 @@ -# Kubernetes 简介 - -[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。 - -- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。 - -- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。 - -- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods,job启动后会创建这些pod并开始执行一个程序,等待这个程序执行成功并返回0则成功退出,如果执行失败,也可以配置不同的重试机制。 - -- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。 - -- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 
命名空间,在kubernetes中创建的所有资源对象(例如上文的pod,job)等都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同空间的资源名可以重复,命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。 - -- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合,将外部的存储服务在Kubernetes中描述成为统一的资源形式,便于存储资源管理和Pod引用。 - -## 部署Kubernetes集群 - -Kubernetes提供了多种集群部署的方案,本文档内不重复介绍。这里给出集中常见的部署方法: - -- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器,便于本地验证和测试。 -- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统,不同主机(Bare-Metal, AWS, GCE)条件下,快速部署集群。 -- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。 -- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。 - -可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。 - -## 选择存储方案 - -容器不会保留在运行时生成的数据,job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务,需要有一个外部的存储服务来保存训练所需数据和训练输出。 -常见的可选存储服务包括: - -- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单,可以用于小量数据的验证。不提供分布式存储,高可用,冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。 -- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统,可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。 -- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统,支持rbd,POSIX API接口(ceph fs)和对象存储API,参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。 -- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。 - -## 配置kubectl - -### 安装kubectl -``` -# OS X -curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl - -# Linux -curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl - -# Windows -curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe -``` - -### 配置kubectl访问你的kubernetes集群 - -编辑`~/.kube/config`这个配置文件,修改`Master-IP`的地址。如果使用SSL认证,则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问(比如通过8080端口),也可以去掉这些证书的配置。 -``` -apiVersion: v1 -clusters: -- cluster: - certificate-authority: /path/to/ca.crt - server: https://[Master-IP]:443 - name: minikube -contexts: -- context: - cluster: minikube - user: minikube - name: minikube -current-context: minikube -kind: Config -preferences: {} -users: -- name: minikube - user: - client-certificate: /path/to/apiserver.crt - client-key: /Users/wuyi/.minikube/apiserver.key -``` diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png deleted file mode 100644 index 2183a232ad402b76f82a67234a5c93e13ce97ac3..0000000000000000000000000000000000000000 Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and /dev/null differ diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index 424d7718c64438496cf0895397babd5408e1ca02..ae24ced770492743065e37654b494caf6b4c5bc0 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ 
b/doc/mobile/cross_compiling_for_android_cn.md @@ -1,8 +1,9 @@ # Android平台编译指南 用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: -- 基于Docker容器的编译方式 -- 基于Linux交叉编译环境的编译方式 + +- [基于Docker容器的编译方式](#基于docker容器的编译方式) +- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式) ## 基于Docker容器的编译方式 Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。 @@ -16,6 +17,12 @@ $ cd Paddle $ docker build -t username/paddle-android:dev . -f Dockerfile.android ``` +用户也可以使用PaddlePaddle提供的官方开发镜像: + +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + ### 编译PaddlePaddle C-API库 构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 Android的Docker开发镜像向用户提供两个可配置的参数: @@ -41,23 +48,25 @@ Android的Docker开发镜像向用户提供两个可配置的参数: ANDROID_API - >= 21 + >= 16 21 - 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 + ```bash $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ``` - 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 + ```bash $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ``` -执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 +执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 ## 基于Linux交叉编译环境的编译方式 本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 @@ -83,6 +92,7 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain 此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 - 构建`arm64-v8a`、 `Android API 21`的独立工具链: + ```bash your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain @@ -90,14 +100,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain 此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 -注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。 - ### 配置交叉编译参数 CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 
3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。 - `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 - `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 @@ -119,7 +127,7 @@ Android平台可选配置参数: 其他配置参数: - `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。 -- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。 常用的cmake配置如下: @@ -147,9 +155,10 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. ``` -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 **性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + - 设置`CMAKE_BUILD_TYPE`为`Release` - 使用`clang`编译工具链 - `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md index 26858581fc1d77a9391520ac0dfd80fbd98f508c..0cf50181df4116beda3aa6faf836eda92edf6066 100644 --- a/doc/mobile/cross_compiling_for_android_en.md +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -1,6 +1,9 @@ # Build PaddlePaddle for Android -There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker. +There are two approaches to build PaddlePaddle for Android: + +- [Cross-Compiling Using Docker](#cross-compiling-using-docker) +- [Cross-Compiling on Linux](#cross-compiling-on-linux) ## Cross-Compiling Using Docker @@ -16,6 +19,12 @@ $ cd Paddle $ docker build -t paddle:dev-android . -f Dockerfile.android ``` +Users can directly use the published Docker image. + +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + ### Build the Inference Library We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: @@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: ANDROID_API - >= 21 + >= 16 21 @@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. 
-**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
-
 ### Cross-Compiling Arguments
 
 CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling. PaddlePaddle detects the CMake version, and for versions newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
 
 Some other CMake arguments you need to know:
 
-- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
+- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
 - `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
 - `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
 
@@ -123,7 +130,7 @@ Some Android-specific arguments:
 
 Other useful arguments:
 
 - `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`.
-- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
+- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/CXX`, or `cc/c++`.
 
 Some frequent configurations for your reference:
 
@@ -158,6 +165,7 @@ There are some other arguments you might want to configure.
 
- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
 
 Our own tips for performance optimization, using clang and Eigen or OpenBLAS:
+
 - `CMAKE_BUILD_TYPE=Release`
 - `ANDROID_TOOLCHAIN=clang`
 - `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
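For reference, the whole Linux-side flow can be strung together as in the sketch below. This is a minimal sketch under the assumptions stated in the docs above; the NDK path, toolchain directory, and install prefix are placeholders, and `ANDROID_STANDALONE_TOOLCHAIN` should be checked against the `android.cmake` of your Paddle revision:

```bash
# Build a standalone toolchain for armeabi-v7a (placeholder paths).
NDK_ROOT=your/path/to/android-ndk-r14b-linux-x86_64
$NDK_ROOT/build/tools/make-standalone-toolchain.sh \
  --arch=arm --platform=android-21 \
  --install-dir=$HOME/arm_standalone_toolchain

# Configure, compile, and install the Android C-API library.
mkdir -p build_android && cd build_android
cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=$HOME/arm_standalone_toolchain \
      -DANDROID_ABI=armeabi-v7a \
      -DUSE_EIGEN_FOR_BLAS=ON \
      -DCMAKE_INSTALL_PREFIX=$HOME/paddle_android_install \
      -DCMAKE_BUILD_TYPE=MinSizeRel \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      ..
make -j"$(nproc)"
make install
```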
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index 9da48e7f2119ce901fbb3abab73400df27be16d2..d5196d9a4c93c7692d2a624ec7d0650e32806338 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -18,11 +18,11 @@ PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/
 
 - `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后,PaddlePaddle的CMake系统会自动编译所有的第三方依赖库,并且强制设置一些PaddlePaddle参数的值(`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。
 - `WITH_C_API`,是否编译C-API预测库,必须设置为ON。在iOS平台上只支持使用C-API来预测。
-- `WITH_SWIG_PY`,必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+- `WITH_SWIG_PY`,必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。
 
 iOS平台可选配置参数:
 
-- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`。
+- `IOS_PLATFORM`,可设置为`OS`(默认值)或`SIMULATOR`。
   - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。
   - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构:
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..19bfe86c511c7e43b462f94c8cabba420b3007f1
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -0,0 +1,120 @@
+# Build PaddlePaddle for iOS
+
+This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from source on macOS.
+
+## Preparation
+
+Apple provides Xcode, the IDE used for iOS development and cross-compiling. Download it from the App Store or from [here](https://developer.apple.com/cn/xcode/). To verify your installation, run the following command:
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides the cross-compiling toolchain configuration file [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory CMake variables that need to be set before cross compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, the CMake target platform name, has to be `iOS`. When this variable is set to `iOS`, PaddlePaddle's CMake will compile all the third-party dependencies and enforce some parameter values (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`).
+
+- `WITH_C_API`, whether to compile the C-API inference library, has to be `ON`, since C-API is the only supported interface for inference on iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. Training or inference via SWIG is not supported on iOS.
+
+Optional variables for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, builds for ARM-based physical devices like the iPhone or iPad.
+  - `SIMULATOR`, builds for x86-architecture simulators.
+- `IOS_ARCH`, the target architecture. By default, all architecture types will be compiled. If you need to specify a single architecture to compile for, valid values for each `IOS_PLATFORM` setting are listed in the table below:
+
+| IOS_PLATFORM | IOS_ARCH             |
+|--------------|----------------------|
+| OS           | armv7, armv7s, arm64 |
+| SIMULATOR    | i386, x86_64         |
+
+- `IOS_DEPLOYMENT_TARGET`, the minimum iOS deployment target version, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use the [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. Can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to the `Developer` directory, which can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the `Developer` directory of the Xcode platform corresponding to your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to the `SDK` root, which can be explicitly set with your `/path/to/platform/Developer/SDKs/SDK`. If left blank, PaddlePaddle will pick the latest SDK under `IOS_DEVELOPMENT_ROOT`.
+
+Other settings:
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. Effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, the host C/C++ compiler. Uses the value of the environment variable `CC/CXX` by default, or `cc/c++` if `CC/CXX` doesn't exist.
+
+Some typical CMake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can set other compiling parameters for your own needs, e.g. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; or if performance is your concern, set `CMAKE_BUILD_TYPE` to `Release`. You can even manipulate the PaddlePaddle compiling procedure by manually setting `CMAKE_C/CXX_FLAGS` values.
+
+**TIPS for better performance**:
+
+- set `CMAKE_BUILD_TYPE` to `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` to `ON`
+
+## Build and install
+
+After the CMake configuration, run the following commands; PaddlePaddle will download and compile the third-party dependencies, then compile and install the PaddlePaddle inference library.
+
+```
+$ make
+$ make install
+```
+
+Please note: if you have previously compiled PaddlePaddle in the source directory for another platform, remove the `third_party` and `build` directories within the source tree with `rm -rf` to ensure that all the third-party dependencies and PaddlePaddle itself are freshly compiled with the current CMake configuration.
+
+The `your/path/to/install` directory will contain the following directories after `make install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains the PaddlePaddle C-API static library.
+- `third_party`, contains all the third-party libraries.
+
+Please note: if the PaddlePaddle library needs to support both physical devices and simulators, you will need to compile for each target separately and then merge the resulting libraries into a fat library with `lipo`.
+
+Now you will have the PaddlePaddle library compiled and installed; the fat library can be used in deep learning related iOS apps. Please refer to the C-API documentation for usage guides.
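As a concrete illustration of the `lipo` merge step mentioned above, the sketch below combines a device build and a simulator build of the C-API static library into one fat archive. The two install prefixes and the library file name `libpaddle_capi_whole.a` are assumptions for illustration; substitute the names your build actually produces.

```bash
# Merge the arm (IOS_PLATFORM=OS) and x86 (IOS_PLATFORM=SIMULATOR) builds,
# installed under two separate prefixes, into a single fat static library.
lipo -create \
  ios_os_install/lib/libpaddle_capi_whole.a \
  ios_sim_install/lib/libpaddle_capi_whole.a \
  -output libpaddle_capi_whole_fat.a

# Verify that all expected architectures are present.
lipo -info libpaddle_capi_whole_fat.a
```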
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst index 3c08d736717cfe8d5fdf449dc58015086befbe60..ef421dacad458828cadf8cf505375d6c4bfd9dde 100644 --- a/doc/mobile/index_en.rst +++ b/doc/mobile/index_en.rst @@ -5,4 +5,5 @@ MOBILE :maxdepth: 1 cross_compiling_for_android_en.md + cross_compiling_for_ios_en.md cross_compiling_for_raspberry_en.md diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c index 89c4d7f00aae2a92ae30ba7b4305550d150dd985..05ec421fff6e1c57b0bace080668d3793f85480f 100644 --- a/go/pserver/client/c/test/test_cclient.c +++ b/go/pserver/client/c/test/test_cclient.c @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include diff --git a/go/pserver/client/c/test/test_mnist.py b/go/pserver/client/c/test/test_mnist.py index c3a3af55e2812fa0c965d22ddaba198f43f3c4ad..7b50a10afc68b87f331c4d0afede9413b3aa2d35 100644 --- a/go/pserver/client/c/test/test_mnist.py +++ b/go/pserver/client/c/test/test_mnist.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2 as paddle import gzip diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index 8d9c6b9b20f515ed0865df8cf46b6dfc2d8ffa34..7ef0fca496e8a3836d1a38b0ff576652d72ce177 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2 as paddle import paddle.v2.dataset.uci_housing as uci_housing import paddle.v2.master as master diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 7d2becbdd772747d77890321fce6721d8d17fb30..4a98ede278fad85ff2beef3c8e7dd158912f693a 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -24,6 +24,7 @@ else() add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(pybind) + add_subdirectory(inference) endif() if(WITH_SWIG_PY) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index d6b8464100d4497876aa3f6f7cbc666aafae4bfc..cf84568ecdf1227b0d0ed3606a4a9a6e5186af72 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -25,8 +25,18 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) +SET(SWIG_NEED_FLAGS + -ftls-model=global-dynamic + -Wno-parentheses-equality + -Wno-self-assign + -Wno-maybe-uninitialized + -Wno-missing-field-initializers) + FOREACH(flag ${SWIG_NEED_FLAGS}) + safe_set_cxxflag(SWIG_CXX_FLAGS ${flag}) +ENDFOREACH() + SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/api/test/testTrainConfig.py b/paddle/api/test/testTrainConfig.py index 77e0cd37d566d2571fada76b9948a9b0616ad044..ab9a83e4a35305bcab42a5e05d8f4880f4edd9bb 100644 --- a/paddle/api/test/testTrainConfig.py +++ b/paddle/api/test/testTrainConfig.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=100, learning_method=AdamOptimizer()) diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index d267b14657be2a773d1dacfd9ac3767cddc47415..ebb083c5a477d5be91ef14be74dd9de349d07931 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -4,6 +4,16 @@ else () set(PADDLE_FLOAT_TYPE float) endif() +execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT + RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT PADDLE_GIT_COMMIT) + set(PADDLE_GIT_COMMIT "no commit information") +endif() + # config.h used for C-API. It will store Paddle building configuration as a # header. 
Make user just include PaddleCAPI.h then can get building # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp index bb8249a5511c089ec2f2263ff4cc290f0a5a8fce..c038789340033fcf6dcc07a41b033a50e980c965 100644 --- a/paddle/capi/Main.cpp +++ b/paddle/capi/Main.cpp @@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) { isInit = true; return kPD_NO_ERROR; } + +paddle_error paddle_init_thread() { + if (FLAGS_use_gpu) { + hl_init(FLAGS_gpu_id); + } + return kPD_NO_ERROR; +} } diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index d5b55e1c95f248f551e6a0a3b39123169dd7784f..cbacd1fb71c14f490ff548db714e728772292b4b 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) { paddle_error paddle_matrix_set_row(paddle_matrix mat, uint64_t rowID, paddle_real* rowArray) { - if (mat == nullptr) return kPD_NULLPTR; + if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE; @@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat, } PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, - paddle_real* value) { + paddle_real* value) { if (mat == nullptr || value == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; @@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, } PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, - paddle_real* result) { + paddle_real* result) { if (mat == nullptr || result == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in index d205307588eb60b2e11accb9f825391f7c1453f2..0ddbd8c753c55ab95a89e1781c64b9416f7344e7 100644 --- a/paddle/capi/config.h.in +++ b/paddle/capi/config.h.in @@ -3,6 +3,9 @@ typedef @PADDLE_FLOAT_TYPE@ paddle_real; +#define __PADDLE_VERSION__ "@PADDLE_VERSION@" +#define __PADDLE_COMMIT__ "@PADDLE_GIT_COMMIT@" + // Since we only support linux and macos in compile, always use clang or // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below. #define PD_API __attribute__((visibility("default"))) diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96ce31b45fc3f83237146443cbe4289af7bfa239 --- /dev/null +++ b/paddle/capi/error.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "error.h"
+
+extern "C" const char* paddle_error_string(paddle_error err) {
+  switch (err) {
+    case kPD_NULLPTR:
+      return "nullptr error";
+    case kPD_OUT_OF_RANGE:
+      return "out of range error";
+    case kPD_PROTOBUF_ERROR:
+      return "protobuf error";
+    case kPD_NOT_SUPPORTED:
+      return "not supported error";
+    case kPD_UNDEFINED_ERROR:
+      return "undefined error";
+    default:
+      return "";
+  }
+}
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
index 44d8c2040d1aad698398089baeee6f13c3deeb55..2da9e0a3ef604fbcd53bac271c72ef33b3105152 100644
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef __PADDLE_CAPI_ERROR_H__
 #define __PADDLE_CAPI_ERROR_H__
 
+#include "config.h"
+
 /**
  * Error Type for Paddle API.
  */
@@ -27,4 +29,17 @@ typedef enum {
   kPD_UNDEFINED_ERROR = -1,
 } paddle_error;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Error string for Paddle API.
+ */
+PD_API const char* paddle_error_string(paddle_error err);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h
index e32f2f9836f63ba10ef5be447a4c41514e079219..9efcbc387e6080f8c9ed284412d1abced658a0cd 100644
--- a/paddle/capi/examples/model_inference/common/common.h
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -1,3 +1,16 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #ifndef __CAPI_EXAMPLE_COMMON_H__
 #define __CAPI_EXAMPLE_COMMON_H__
 #include
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 876af2aa7615c098d225b56ce2ea0b1529a6e3c6..376cd46fb09a156d426453986c87dcff6e2f71dd 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -1,72 +1,97 @@
 #include
 #include
+
 #include "../common/common.h"
 
+// Modify this path as needed.
 #define CONFIG_BIN "./trainer_config.bin"
+// Modify this path as needed.
+// This demo assumes that a merged model is not used; in that case, this path
+// is the directory storing all the trained parameters.
+// If the model is trained by the PaddlePaddle V2 API, the model is saved as
+// a compressed file. You need to uncompress the compressed file first.
+#define MODEL_PATH "models/pass_4"
 
 int main() {
-  // Initalize Paddle
+  // Initialize the PaddlePaddle runtime environment.
   char* argv[] = {"--use_gpu=False"};
   CHECK(paddle_init(1, (char**)argv));
 
-  // Reading config binary file. It is generated by `convert_protobin.sh`
+  // Read the binary configuration file generated by `convert_protobin.sh`.
   long size;
   void* buf = read_config(CONFIG_BIN, &size);
 
-  // Create a gradient machine for inference.
+  // Create the gradient machine for inference.
paddle_gradient_machine machine; CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); - CHECK(paddle_gradient_machine_randomize_param(machine)); - // Loading parameter. Uncomment the following line and change the directory. - // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, - // "./some_where_to_params")); + // Load the trained model. Modify the parameter MODEL_PATH to set the correct + // path of the trained model. + CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, MODEL_PATH)); + + // Inputs and outputs of the network are organized as paddle_arguments objects + // in C-API. In the comments below, "argument" specifically means one input of + // the neural network in PaddlePaddle C-API. paddle_arguments in_args = paddle_arguments_create_none(); - // There is only one input of this network. + // There is only one data layer in this demo MNIST network, so invoke this + // function to create one argument. CHECK(paddle_arguments_resize(in_args, 1)); - // Create input matrix. - paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10, - /* size */ 784, - /* useGPU */ false); + // Each argument needs one matrix or one ivector (integer vector, for sparse + // index input, usually used in NLP tasks) to hold the real input data. + // In the comments below, "matrix" specifically means the object needed by an + // argument to hold the data. Here we create the matrix for the above-created + // argument to store the testing samples. + paddle_matrix mat = + paddle_matrix_create(/* height = batch size */ 1, + /* width = dimensionality of the data layer */ 784, + /* whether to use GPU */ false); + + paddle_real* array; + // Get the pointer pointing to the start address of the first row of the + // created matrix. + CHECK(paddle_matrix_get_row(mat, 0, &array)); + + // Fill the matrix with a randomly generated test sample. srand(time(0)); - - std::vector input; - input.resize(784 * 10); - - for (int i = 0; i < input.size(); ++i) { - input[i] = rand() / ((float)RAND_MAX); + for (int i = 0; i < 784; ++i) { + array[i] = rand() / ((float)RAND_MAX); } - - // Set value for the input matrix - CHECK(paddle_matrix_set_value(mat, input.data())); + // Assign the matrix to the argument. CHECK(paddle_arguments_set_value(in_args, 0, mat)); + // Create the output argument. paddle_arguments out_args = paddle_arguments_create_none(); + + // Invoke the forward computation. CHECK(paddle_gradient_machine_forward(machine, in_args, out_args, - /* isTrain */ false)); - paddle_matrix prob = paddle_matrix_create_none(); + /* is train task or not */ false)); + // Create the matrix to hold the forward result of the neural network. + paddle_matrix prob = paddle_matrix_create_none(); + // Access the matrix of the output argument; the predicted result is stored + // in it. CHECK(paddle_arguments_get_value(out_args, 0, prob)); - std::std::vector result; - int height; - int width; - - CHECK(paddle_matrix_get_shape(prob, &height, &width); - result.resize(height * width); - CHECK(paddle_matrix_get_value(prob, result.data())); + uint64_t height; + uint64_t width; + CHECK(paddle_matrix_get_shape(prob, &height, &width)); + CHECK(paddle_matrix_get_row(prob, 0, &array)); - printf("Prob: "); + printf("Prob: \n"); for (int i = 0; i < height * width; ++i) { - printf("%.2f ", result[i]); + printf("%.4f ", array[i]); + if ((i + 1) % width == 0) { + printf("\n"); + } } printf("\n"); + // Clean up.
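  // Destruction below mirrors creation in roughly reverse order: the
  // output-side objects (prob, out_args) first, then the input-side ones.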
CHECK(paddle_matrix_destroy(prob)); CHECK(paddle_arguments_destroy(out_args)); CHECK(paddle_matrix_destroy(mat)); diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/capi/examples/model_inference/dense/merge_v2_model.py new file mode 100644 index 0000000000000000000000000000000000000000..760a485a53f5edd31622a0f25a7bf32e6230e9a4 --- /dev/null +++ b/paddle/capi/examples/model_inference/dense/merge_v2_model.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from paddle.utils.merge_model import merge_v2_model + +from mnist_v2 import network + +net = network(is_infer=True) +param_file = "models/params_pass_4.tar" +output_file = "output.paddle.model" +merge_v2_model(net, param_file, output_file) diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/capi/examples/model_inference/dense/mnist_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..174436bd1d7c1efaf8b25b002ada38b6babeba1c --- /dev/null +++ b/paddle/capi/examples/model_inference/dense/mnist_v2.py @@ -0,0 +1,130 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
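+# Usage sketch (flags as defined by parse_cmd() below, file names as used in
+# this demo): run `python mnist_v2.py --task train` to save
+# models/params_pass_<N>.tar, then `python mnist_v2.py --task dump_config` to
+# write the trainer_config.bin consumed by the C-API example.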
+import os +import sys +import gzip +import logging +import argparse +from PIL import Image +import numpy as np + +import paddle.v2 as paddle +from paddle.utils.dump_v2_config import dump_v2_config + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +def multilayer_perceptron(img, layer_size, lbl_dim): + for idx, size in enumerate(layer_size): + hidden = paddle.layer.fc(input=(img if not idx else hidden), + size=size, + act=paddle.activation.Relu()) + return paddle.layer.fc(input=hidden, + size=lbl_dim, + act=paddle.activation.Softmax()) + + +def network(input_dim=784, lbl_dim=10, is_infer=False): + images = paddle.layer.data( + name='pixel', type=paddle.data_type.dense_vector(input_dim)) + + predict = multilayer_perceptron( + images, layer_size=[128, 64], lbl_dim=lbl_dim) + + if is_infer: + return predict + else: + label = paddle.layer.data( + name='label', type=paddle.data_type.integer_value(lbl_dim)) + return paddle.layer.classification_cost(input=predict, label=label) + + +def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"): + if task == "train": + if not os.path.exists(save_dir): + os.mkdir(save_dir) + + paddle.init(use_gpu=use_gpu, trainer_count=trainer_count) + cost = network() + parameters = paddle.parameters.create(cost) + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + logger.info("Pass %d, Batch %d, Cost %f, %s" % + (event.pass_id, event.batch_id, event.cost, + event.metrics)) + if isinstance(event, paddle.event.EndPass): + with gzip.open( + os.path.join(save_dir, "params_pass_%d.tar" % + event.pass_id), "w") as f: + trainer.save_parameter_to_tar(f) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=128), + event_handler=event_handler, + num_passes=5) + elif task == "dump_config": + predict = network(is_infer=True) + dump_v2_config(predict, "trainer_config.bin", True) + else: + raise RuntimeError(("Invalid value for parameter task. " + "Available options are: train and dump_config.")) + + +def parse_cmd(): + parser = argparse.ArgumentParser( + description="PaddlePaddle MNIST demo for CAPI.") + parser.add_argument( + "--task", + type=str, + required=False, + help=("A string indicating the task type. " + "Available options are: \"train\", \"dump_config\"."), + default="train") + parser.add_argument( + "--use_gpu", + type=bool, + help=("A bool flag indicating whether to use GPU device or not."), + default=False) + parser.add_argument( + "--trainer_count", + type=int, + help=("This parameter is only used in the training task. It indicates " + "how many computing threads are created in training."), + default=1) + parser.add_argument( + "--save_dir", + type=str, + help=("This parameter is only used in the training task. 
It indicates the " + "path of the directory to save the trained models."), + default="models") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_cmd() + main(args.task, args.use_gpu, args.trainer_count, args.save_dir) diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/capi/examples/model_inference/dense/trainer_config.py index 873ec119e7a3d4debe50af2ba259ace50b0cbf7c..fbf08903578b309ee163b1f4c70b80133a4324b8 100644 --- a/paddle/capi/examples/model_inference/dense/trainer_config.py +++ b/paddle/capi/examples/model_inference/dense/trainer_config.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * img = data_layer(name='pixel', size=784) diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt index 98e411ddc02a46034e8f6ceb00657622d998c9f3..2fc8debddedeab6ae982b0df49ec2b73bc0f85f5 100644 --- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt +++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt @@ -1,8 +1,29 @@ project(multi_thread) cmake_minimum_required(VERSION 2.8) -aux_source_directory(. SRC_LIST) -add_executable(${PROJECT_NAME} ${SRC_LIST}) + find_package (Threads) + +if(NOT PADDLE_ROOT) + set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path") +endif() +if(PADDLE_ROOT) + include_directories(${PADDLE_ROOT}/include) + link_directories(${PADDLE_ROOT}/lib) +endif() + +set(CPU_SRCS main.c) +add_executable(${PROJECT_NAME} ${CPU_SRCS}) set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99) -target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared - ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${PROJECT_NAME} + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) + +find_package(CUDA QUIET) +if(CUDA_FOUND) + set(GPU_SRCS main_gpu.c) + cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS}) + set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99) + target_link_libraries(${PROJECT_NAME}_gpu + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) +endif(CUDA_FOUND) diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c new file mode 100644 index 0000000000000000000000000000000000000000..6fd376e0d1a2fee4f9a0f676b53c6f2891795cab --- /dev/null +++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c @@ -0,0 +1,113 @@ +#include +#include +#include +#include "../common/common.h" + +#define CONFIG_BIN "./trainer_config.bin" +#define NUM_THREAD 4 +#define NUM_ITER 1000 + +pthread_mutex_t mutex; + +/* + * @brief This is a simple inference example that runs multiple threads on a GPU. + * Each thread holds its own local gradient_machine but shares the same + * parameters. + * If you want to run on different GPUs, you need to launch + * multiple processes or set trainer_count > 1.
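 * The per-thread flow implemented below: paddle_init_thread(), then a
 * forward loop on a thread-local machine created in main() via
 * paddle_gradient_machine_create_shared_param(), then the destroy calls.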
+ */ +void* thread_main(void* gm_ptr) { + // Initialize the thread environment of Paddle. + CHECK(paddle_init_thread()); + + paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr); + // Create input arguments. + paddle_arguments in_args = paddle_arguments_create_none(); + // Create input matrix. + paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1, + /* size */ 784, + /* useGPU */ true); + // Create output arguments. + paddle_arguments out_args = paddle_arguments_create_none(); + // Create output matrix. + paddle_matrix prob = paddle_matrix_create_none(); + + // CPU buffer to cache the input and output. + paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real)); + paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real)); + for (int iter = 0; iter < NUM_ITER; ++iter) { + // There is only one input layer of this network. + CHECK(paddle_arguments_resize(in_args, 1)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + + for (int i = 0; i < 784; ++i) { + cpu_input[i] = rand() / ((float)RAND_MAX); + } + CHECK(paddle_matrix_set_value(mat, cpu_input)); + + CHECK(paddle_gradient_machine_forward(machine, + in_args, + out_args, + /* isTrain */ false)); + + CHECK(paddle_arguments_get_value(out_args, 0, prob)); + CHECK(paddle_matrix_get_value(prob, cpu_output)); + + pthread_mutex_lock(&mutex); + printf("Prob: "); + for (int i = 0; i < 10; ++i) { + printf("%.2f ", cpu_output[i]); + } + printf("\n"); + pthread_mutex_unlock(&mutex); + } + + CHECK(paddle_matrix_destroy(prob)); + CHECK(paddle_arguments_destroy(out_args)); + CHECK(paddle_matrix_destroy(mat)); + CHECK(paddle_arguments_destroy(in_args)); + CHECK(paddle_gradient_machine_destroy(machine)); + + free(cpu_input); + free(cpu_output); + + return NULL; +} + +int main() { + // Initialize Paddle + char* argv[] = {"--use_gpu=True"}; + CHECK(paddle_init(1, (char**)argv)); + + // Reading config binary file. It is generated by `convert_protobin.sh` + long size; + void* buf = read_config(CONFIG_BIN, &size); + + // Create a gradient machine for inference. + paddle_gradient_machine machine; + CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); + CHECK(paddle_gradient_machine_randomize_param(machine)); + + // Loading parameter. Uncomment the following line and change the directory. + // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, + // "./some_where_to_params")); + srand(time(0)); + pthread_mutex_init(&mutex, NULL); + + pthread_t threads[NUM_THREAD]; + + for (int i = 0; i < NUM_THREAD; ++i) { + paddle_gradient_machine thread_local_machine; + CHECK(paddle_gradient_machine_create_shared_param( + machine, buf, size, &thread_local_machine)); + pthread_create(&threads[i], NULL, thread_main, thread_local_machine); + } + + for (int i = 0; i < NUM_THREAD; ++i) { + pthread_join(threads[i], NULL); + } + + pthread_mutex_destroy(&mutex); + + return 0; +} diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/capi/examples/model_inference/sequence/trainer_config.py index 6bbc7a909aa03950ce621efa43fa47d9cdd016f8..c1326bb95550dc3a7ac6385a81158b5bbd88d5af 100644 --- a/paddle/capi/examples/model_inference/sequence/trainer_config.py +++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * WORD_DIM = 3000 diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c index 8ba67aee560239d3050c7f40198d20df99ec370e..029b94ee63ba282aa48193ffd4f625657ddc3a60 100644 --- a/paddle/capi/examples/model_inference/sparse_binary/main.c +++ b/paddle/capi/examples/model_inference/sparse_binary/main.c @@ -1,5 +1,6 @@ #include #include + #include "../common/common.h" #define CONFIG_BIN "./trainer_config.bin" @@ -9,16 +10,18 @@ int main() { char* argv[] = {"--use_gpu=False"}; CHECK(paddle_init(1, (char**)argv)); - // Reading config binary file. It is generated by `convert_protobin.sh` + // Read the binary configuration file which is generated by + // `convert_protobin.sh` long size; void* buf = read_config(CONFIG_BIN, &size); - // Create a gradient machine for inference. + // Create the gradient machine for inference. paddle_gradient_machine machine; CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); CHECK(paddle_gradient_machine_randomize_param(machine)); - // Loading parameter. Uncomment the following line and change the directory. + // Load the trained parameters. Uncomment the following line and change the + // directory as needed. // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, // "./some_where_to_params")); paddle_arguments in_args = paddle_arguments_create_none(); @@ -26,7 +29,7 @@ int main() { // There is only one input of this network. CHECK(paddle_arguments_resize(in_args, 1)); - // Create input matrix. + // Create the input matrix. paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false); srand(time(0)); paddle_real* array; diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp index 482b51e8a8430863c3e13df2298f6979d3959461..1f0e033c5b7ab713f1747d58f4fcdaad001f0715 100644 --- a/paddle/capi/gradient_machine.cpp +++ b/paddle/capi/gradient_machine.cpp @@ -168,3 +168,13 @@ paddle_error paddle_gradient_machine_get_layer_output( out->args.push_back(layerOutput); return kPD_NO_ERROR; } + +paddle_error paddle_gradient_machine_release_layer_output( + paddle_gradient_machine machine) { + auto m = cast(machine); + if (m == nullptr || m->machine == nullptr) { + return kPD_NULLPTR; + } + m->machine->releaseOutput(); + return kPD_NO_ERROR; +} diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h index 28eeb23e3bbdd4cc22a25c14170bf56c294f8cd7..7e37dea00b27b6cb955486b4210c1373decbcfa5 100644 --- a/paddle/capi/gradient_machine.h +++ b/paddle/capi/gradient_machine.h @@ -113,6 +113,14 @@ paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine, const char* layerName, paddle_arguments args); +/** + * @brief Release the middle layer's output memory of the gradient machine. 
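+ * A hedged usage note: this is meant to be called after
+ * paddle_gradient_machine_forward(), once outputs fetched through
+ * paddle_gradient_machine_get_layer_output() are no longer needed.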
+ * @param [in] gradient machine that has run an inference + * @return paddle_error + */ +PD_API paddle_error +paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine); + #ifdef __cplusplus } #endif diff --git a/paddle/capi/main.h b/paddle/capi/main.h index 893ebcbd58dd24cf835fb2005865c94c9ba2a810..99c4e8428dbaa14d36dc2d36b2a4f16c9ec3e0d1 100644 --- a/paddle/capi/main.h +++ b/paddle/capi/main.h @@ -26,6 +26,13 @@ extern "C" { */ PD_API paddle_error paddle_init(int argc, char** argv); +/** + * Initialize the thread environment of Paddle. + * @note it is required for GPU runs but optional for CPU runs. + * For GPU runs, all threads will run on the same GPU device. + */ +PD_API paddle_error paddle_init_thread(); + #ifdef __cplusplus } #endif diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h index 01b8bad2ee9f528f8622346f43b9ff82225a7e73..8cc3e0034e058daefc63c69efe0b1f575c586897 100644 --- a/paddle/capi/matrix.h +++ b/paddle/capi/matrix.h @@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat, * @note value should contain enough element of data to init the mat */ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, - paddle_real* value); + paddle_real* value); /** * @brief PDMatGetRow Get raw row buffer from matrix @@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat, paddle_real** rawRowBuffer); /** - * @brief copy data from the matrix + * @brief copy data from the matrix * @param [in] mat Target matrix - * @param [out] result pointer to store the matrix data + * @param [out] result pointer to store the matrix data * @return paddle_error * @note the space of the result should allocated before invoke this API */ PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, - paddle_real* result); + paddle_real* result); /** * @brief PDMatCreateNone Create None Matrix * @return diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/capi/tests/test_predict_network.py index 82ef5cb1a70398df65ace3c802076743c3ebe341..46a985d47652faa573d1b9817952a2b81d0db8fc 100644 --- a/paddle/capi/tests/test_predict_network.py +++ b/paddle/capi/tests/test_predict_network.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=100) diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 89c1f48edacbe0a4432957fe066481412db7e6e1..88418062927cd0f7714e992cc2495109da45d32f 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -116,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt, * @param[in] paddingW padding width. * @param[out] tgtData output data. * @param[in] tgtStride stride between output data samples. + * @param[in] excludeMode whether to consider paddings for size.
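+ * A concrete reading of excludeMode (matching the kernel change below): the
+ * averaging denominator is poolSize = excludeMode ? (hend - hstart) *
+ * (wend - wstart) : sizeY * sizeX, i.e. paddings are counted in the
+ * average only when excludeMode is false.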
* */ extern void hl_avgpool_forward(const int frameCnt, @@ -132,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride); + const int tgtStride, + bool excludeMode); /** * @brief Maximum pool backward. @@ -154,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt, * @param[in] scaleB scale. * @param[out] backGrad output grad. * @param[in] outStride stride between output data samples. + * @param[in] excludeMode whether to consider paddings for size. * */ extern void hl_avgpool_backward(const int frameCnt, @@ -172,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride); + const int outStride, + bool excludeMode); extern void hl_maxpool3D_forward(const int frameCnt, const real* inputData, diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 968ed4840ffb0623b57bd6e6d839973e109394de..706cc59a8e394b109d2b290425f4b5f51d987f28 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -68,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride) {} + const int tgtStride, + const bool excludeMode) {} inline void hl_avgpool_backward(const int frameCnt, const real* outGrad, @@ -86,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride) {} + const int outStride, + const bool excludeMode) {} inline void hl_maxpool3D_forward(const int frameCnt, const real* inputData, diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h index 2412ed5abc13b2a83521a75524f581e106788b60..a0ba71faba9131f6f3f031c8276c3851090fac45 100644 --- a/paddle/cuda/src/avx_mathfun.h +++ b/paddle/cuda/src/avx_mathfun.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* AVX implementation of sin, cos, sincos, exp and log diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 3699b1e8ae9d8f813439eaeaa760c4a9f6e100a0..2d1bc4f6d55fac4b74f4e58d40fe56aa61d19cf9 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -210,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads, const int padH, const int padW, real* tgtData, - const int tgtStride) { + const int tgtStride, + const bool excludeMode) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -224,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads, int wend = min(wstart + sizeX, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - int pool_size = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; real aveval = 0; inputData += (frameNum * channels + c) * height * width; @@ -235,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads, } int tgtIndex = index % (pooledW * pooledH * channels) + frameNum * tgtStride; - tgtData[tgtIndex] = aveval / pool_size; + tgtData[tgtIndex] = aveval / poolSize; } } @@ -253,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride) { + const int tgtStride, + const bool excludeMode) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; KeAvgPoolForward<<>>(num_kernels, @@ -270,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt, paddingH, paddingW, tgtData, - tgtStride); + tgtStride, + excludeMode); CHECK_SYNC("hl_avgpool_forward failed"); } @@ -290,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads, real scaleA, real scaleB, real* tgtGrad, - const int outStride) { + const int outStride, + const bool excludeMode) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -314,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads, int wstart = pw * strideW - padW; int wend = min(wstart + sizeX, width); wstart = max(wstart, 0); - int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw] / poolsize; + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; + gradient += outGrad[ph * pooledW + pw] / poolSize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; @@ -338,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride) { + const int outStride, + const bool excludeMode) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -358,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt, scaleA, scaleB, backGrad, - outStride); + outStride, + excludeMode); CHECK_SYNC("hl_avgpool_backward failed"); } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index c08e844847737b1172f6453767cc7f5e7b1a2bda..597ea959f230d88350796cef05b7d6f2a42e594a 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -5,8 +5,19 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) +if (WITH_GPU) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) +else() + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) +endif () + cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +if (WITH_GPU) + nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) +else() + cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +endif() + cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) @@ -15,9 +26,18 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) cc_test(variable_test SRCS variable_test.cc) -cc_library(scope SRCS scope.cc DEPS glog) +cc_library(threadpool SRCS threadpool.cc) +cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) + +cc_library(scope SRCS scope.cc 
DEPS glog threadpool) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) +cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) +cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) + +cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor + framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc @@ -25,13 +45,14 @@ device_context) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) -cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) +cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog + shape_inference data_transform lod_tensor) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) -cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. @@ -51,11 +72,15 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) - -cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) -cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place) - cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) + +cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator) +cc_test(init_test SRCS init_test.cc DEPS init) + +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) +cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) +nv_test(data_device_transform_test SRCS data_device_transform_test.cu + DEPS operator op_registry init math_function) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index b1e17936417e4ce09bace1d1a5d346d1c9cfa710..b0fd4d2750eb2529706d871947332d39494505cd 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -19,42 +19,42 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { switch (attr_desc.type()) { - case framework::AttrType::BOOLEAN: { + case proto::AttrType::BOOLEAN: { return attr_desc.b(); } - case framework::AttrType::INT: { + case proto::AttrType::INT: { return attr_desc.i(); } - case framework::AttrType::FLOAT: { + case proto::AttrType::FLOAT: { return attr_desc.f(); } - case framework::AttrType::STRING: { + case proto::AttrType::STRING: { return attr_desc.s(); } - case framework::AttrType::BOOLEANS: { + case proto::AttrType::BOOLEANS: { std::vector val(attr_desc.bools_size()); for (int i = 0; i < attr_desc.bools_size(); ++i) { val[i] = attr_desc.bools(i); } return val; } - case framework::AttrType::INTS: { + case proto::AttrType::INTS: { std::vector val(attr_desc.ints_size()); for (int i = 0; i < attr_desc.ints_size(); ++i) { val[i] = attr_desc.ints(i); } return val; } - case framework::AttrType::FLOATS: { + case proto::AttrType::FLOATS: { std::vector val(attr_desc.floats_size()); for (int i = 0; i < attr_desc.floats_size(); ++i) { val[i] = attr_desc.floats(i); } return val; } - case framework::AttrType::STRINGS: { + case proto::AttrType::STRINGS: { std::vector val(attr_desc.strings_size()); for (int i = 0; i < attr_desc.strings_size(); ++i) { val[i] = attr_desc.strings(i); diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index 0641907d6ff7546df1601d3b0263ff42f4186968..c1c63d9cb13acb195b3bc3b30088f5fa7daf2a3d 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -27,12 +27,12 @@ limitations under the License. */ namespace paddle { namespace framework { template -inline AttrType AttrTypeID() { +inline proto::AttrType AttrTypeID() { Attribute tmp = T(); - return static_cast(tmp.which() - 1); + return static_cast(tmp.which() - 1); } -Attribute GetAttrValue(const OpDesc::Attr& attr_desc); +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc); class AttrReader { public: diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index b9018ecdba8303fd6b37c87edd99e192aa604228..85e693434af863bfc3bde29989dbbfc69678d3b7 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/backward.h" #include "paddle/operators/net_op.h" @@ -22,16 +22,27 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" namespace paddle { namespace framework { +static std::unordered_set* g_ctrl_flow_ops_ = nullptr; +// Control Flow operators's backward is significantly different from +// computational operators. Hack Code here. +// We should design a better way to backward CtrlFlowOps. +static std::unordered_set& CtrlFlowOps() { + if (g_ctrl_flow_ops_ == nullptr) { + g_ctrl_flow_ops_ = new std::unordered_set{ + "increment", "lod_rank_table", "less_than"}; + } + return *g_ctrl_flow_ops_; +} + static inline std::unique_ptr CreateGradOp( const OperatorBase& op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var) { - OpDescBind op_desc; + OpDesc op_desc; op_desc.SetInputMap(op.Inputs()); op_desc.SetOutputMap(op.Outputs()); op_desc.SetType(op.Type()); @@ -42,7 +53,7 @@ static inline std::unique_ptr CreateGradOp( grad_ops.reserve(grad_descs.size()); std::transform(grad_descs.begin(), grad_descs.end(), std::back_inserter(grad_ops), - [](const std::unique_ptr& grad_desc) { + [](const std::unique_ptr& grad_desc) { return OpRegistry::CreateOp(*grad_desc); }); PADDLE_ENFORCE(!grad_ops.empty()); @@ -179,8 +190,9 @@ static std::unique_ptr BackwardRecursive( // collect all the offset for each alias, // insert a sum operator to add all aliases to output insert_position.push_back( - {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, - {{"Out", {name}}}, {})}); + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); } // make sure the inserted `sum` ops follow the BFS order. @@ -205,7 +217,8 @@ static std::unique_ptr BackwardRecursive( // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {grad_input}}}, {})); + {{"Out", {grad_input}}}, + AttributeMap{})); } return false; }); @@ -218,21 +231,6 @@ static std::unique_ptr BackwardRecursive( return false; }); - // process recurrent gradient op as a special operator. - if (forwardOp.Type() == "dynamic_recurrent") { - // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), - // or this will result in infinite loop. 
- const auto& rnnop = - *static_cast(&forwardOp); - auto rnn_grad_op = - static_cast(grad_op.get()); - const auto& stepnet_op = - *static_cast(&rnnop.rnn.GetStepUnit()); - // create stepnet's gradient op - rnn_grad_op->rnn.SetStepUnit( - BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); - } - if (net->ops_.empty()) { // Current no aux op is added to network return grad_op; } @@ -298,18 +296,30 @@ static std::string FwdName(const std::string& grad_name) { static void CreateGradVarInBlock( size_t grad_op_start_index, const std::unordered_map& param_name_map, - BlockDescBind* block_desc, + BlockDesc* block_desc, std::unordered_map* grad_var_record) { auto ops = block_desc->AllOps(); for (size_t op_index = grad_op_start_index; op_index < ops.size(); ++op_index) { std::unordered_set new_vars; + auto& ctrl_flow_ops = CtrlFlowOps(); ForEachVarName(ops[op_index]->Outputs(), [&](const std::string& grad_var_name) { - if (block_desc->HasVar(grad_var_name)) { + if (ctrl_flow_ops.find(ops[op_index]->Type()) != + ctrl_flow_ops.end()) { + if (block_desc->HasVarRecursive(grad_var_name)) { + return false; + } + } else { + if (block_desc->HasVar(grad_var_name)) { + return false; + } + } + if (grad_var_name == framework::kEmptyVarName) { return false; } auto var = block_desc->Var(grad_var_name); + VLOG(10) << "Creating Variable " << grad_var_name; new_vars.insert(var->Name()); auto it = param_name_map.find(grad_var_name); if (it == param_name_map.end()) { @@ -331,7 +341,7 @@ static void CreateGradVarInBlock( auto* param = block_desc->FindVarRecursive(pname); auto* grad = block_desc->FindVar(arg); if (param == nullptr) { - grad->SetDataType(DataType::FP32); + grad->SetDataType(proto::DataType::FP32); } else { grad->SetDataType(param->GetDataType()); } @@ -340,23 +350,33 @@ static void CreateGradVarInBlock( } } -std::vector> MakeOpGrad( - const OpDescBind* op_desc, std::unordered_set* no_grad_vars, +std::vector> MakeOpGrad( + const OpDesc* op_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, - const std::vector& grad_block = - std::vector()) { - std::vector> grad_op_descs; + const std::vector& grad_block = std::vector()) { + std::vector> grad_op_descs; // All input gradients of forwarding operator do not need to calculate. const std::vector& inputs = op_desc->InputArgumentNames(); if (AllGradInSet(inputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); return grad_op_descs; // empty vector } + // All output gradients of forwarding operator do not need to calculate. const std::vector& outputs = op_desc->OutputArgumentNames(); + if (AllGradInSet(outputs, *no_grad_vars)) { - for (const std::string& name : inputs) { - no_grad_vars->insert(GradVarName(name)); + VLOG(10) << "Drop operator " << op_desc->Type(); + // FIXME: Hack code here + auto& ctrl_flow_ops = CtrlFlowOps(); + if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) { + // Only computational op need drop input's gradient. 
+ for (const std::string& name : inputs) { + no_grad_vars->insert(GradVarName(name)); + VLOG(10) << " Also drop " << GradVarName(name); + } } + return grad_op_descs; // empty vector } @@ -365,7 +385,7 @@ std::vector> MakeOpGrad( .Get(op_desc->Type()) .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); - std::list> pending_fill_zeros_ops; + std::list> pending_fill_zeros_ops; for (auto& desc : grad_op_descs) { for (const std::string& in_name : desc->InputArgumentNames()) { if (no_grad_vars->count(in_name)) { @@ -373,8 +393,9 @@ std::vector> MakeOpGrad( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op(new OpDescBind( - "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {})); + std::unique_ptr fill_zeros_op( + new OpDesc("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {new_name}}}, AttributeMap{})); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } @@ -386,36 +407,36 @@ std::vector> MakeOpGrad( return grad_op_descs; } -static BlockDescBind* CreateStepBlock( - ProgramDescBind& program_desc, - std::unordered_set* no_grad_vars, +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, int step_block_idx); -std::vector> MakeBlockBackward( - ProgramDescBind& program_desc, int block_idx, +std::vector> MakeBlockBackward( + ProgramDesc& program_desc, int block_idx, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var) { VLOG(5) << "MakeBlockBackward"; - BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); - std::vector op_descs = cur_block->AllOps(); + BlockDesc* cur_block = program_desc.MutableBlock(block_idx); + std::vector op_descs = cur_block->AllOps(); std::unordered_map> dup_out_ops; size_t grad_desc_idx = 0; - std::vector> backward_descs; + std::vector> backward_descs; for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { VLOG(5) << "Making backward " << (*it)->Type() << " op"; - std::vector> op_grads; + std::vector> op_grads; - if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { - int step_block_idx = (*it)->GetBlockAttr("step_block"); - BlockDescBind* backward_block = CreateStepBlock( - program_desc, no_grad_vars, grad_to_var, step_block_idx); + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || + (*it)->Type() == "parallel_do") { + int step_block_idx = (*it)->GetBlockAttr("sub_block"); + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, + grad_to_var, step_block_idx); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); } else if ((*it)->Type() == "conditional_block") { - BlockDescBind* backward_block = + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, grad_to_var, - (*it)->GetBlockAttr("block")); + (*it)->GetBlockAttr("sub_block")); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); } else { op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); @@ -441,14 +462,14 @@ std::vector> MakeBlockBackward( } ++grad_desc_idx; } - std::transform( - op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), - [](std::unique_ptr& ptr) { return std::move(ptr); }); + std::transform(op_grads.begin(), op_grads.end(), + std::back_inserter(backward_descs), + [](std::unique_ptr& ptr) { return std::move(ptr); }); } VLOG(5) << "Appending Sums"; // Check whether some variables are written more than once - std::list>> 
pending_sum_ops; + std::list>> pending_sum_ops; for (const auto& dup : dup_out_ops) { const std::string& out_name = dup.first; const std::vector dup_op = dup.second; @@ -464,17 +485,17 @@ std::vector> MakeBlockBackward( sum_op_inputs.emplace_back(new_name); next_g_name = sum_op_inputs.back(); } - std::unique_ptr sum_op(new OpDescBind( - "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); + std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, + {{"Out", {out_name}}}, + AttributeMap{})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } - pending_sum_ops.sort( - [](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); + pending_sum_ops.sort([](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); for (auto& p : pending_sum_ops) { backward_descs.insert(backward_descs.begin() + p.first + 1, std::move(p.second)); @@ -485,14 +506,13 @@ std::vector> MakeBlockBackward( return backward_descs; } -static BlockDescBind* CreateStepBlock( - ProgramDescBind& program_desc, - std::unordered_set* no_grad_vars, +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, int step_block_idx) { auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, no_grad_vars, grad_to_var); - BlockDescBind* backward_block = + BlockDesc* backward_block = program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); for (auto& ptr : backward_block_op_descs) { backward_block->AppendAllocatedOp(move(ptr)); @@ -501,7 +521,7 @@ static BlockDescBind* CreateStepBlock( } ParamGradInfoMap AppendBackward( - ProgramDescBind& program_desc, const VarDescBind& target, + ProgramDesc& program_desc, const VarDesc& target, const std::unordered_set& no_grad_vars) { std::unordered_set no_grad_var_names; no_grad_var_names.reserve(no_grad_vars.size() + 1); @@ -518,11 +538,11 @@ ParamGradInfoMap AppendBackward( PADDLE_ENFORCE(is_scalar, "target should be scalar"); VLOG(3) << "backward from loss=" << target.Name() << " data_type=" << target.GetDataType(); - std::unique_ptr fill_one_op( - new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}}, - {{"shape", std::vector{1}}, - {"value", static_cast(1.0)}, - {"data_type", target.GetDataType()}})); + std::unique_ptr fill_one_op( + new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dtype", target.GetDataType()}})); // infer var type of fill_one_op fill_one_op->InferVarType(root_block); diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h index 96154fa82cb7a486aa4762ae633982ed6735220b..69ee3802369c16a8b21c0710d2008ef3c085cc5c 100644 --- a/paddle/framework/backward.h +++ b/paddle/framework/backward.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map; ParamGradInfoMap AppendBackward( - ProgramDescBind& program_desc, const VarDescBind& target, + ProgramDesc& program_desc, const VarDesc& target, const std::unordered_set& no_grad_vars); } // namespace framework diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md deleted file mode 100644 index ac60be572419b62f4beb644ff192d413c35e19bb..0000000000000000000000000000000000000000 --- a/paddle/framework/backward.md +++ /dev/null @@ -1,100 +0,0 @@ -# Operator/expression 's Backward - -## Motivation - -In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass. - -## Implementation - -In this design doc, we exported only one API for generating the backward pass. - -```c++ -std::unique_ptr Backward(const OperatorBase& forwardOp, - const std::unordered_set& no_grad_vars); -``` - -The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**. - -### Backward Operator Registry - -A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients. - -| | forward operator | backward operator -| ---------------------- | ---------------- |------------------------- | -| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients | -| **Operator::outputs_** | Outputs | InputGradients | - - In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and to make operators pluggable, the registry mechanism is introduced. - -For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro: - -```cpp -REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad); -``` - -`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively. - -`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name. 
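-With this registration in place, a sketch of how the pair is used (hypothetical driver code; `fwd_op` is assumed to be an already-constructed `mul` operator): - -```cpp -// Sketch: BuildGradOp (detailed in the next section) consults OpInfoMap to -// turn the forward "mul" op into its registered "mul_grad" counterpart. -OperatorBase* bwd_op = BuildGradOp(fwd_op); -``` -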
- -### Backward Operator Creating - -Given a certain forward operator, we can get its corresponding backward operator by calling: - -```cpp -OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op); -``` - -The function `BuildGradOp` will sequentially execute the following processes: - -1. Get the `type_` of the given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`. - -2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to map `inputs`, except those that are not necessary for gradient computing. - -3. Add forward inputs' gradient variables into map `outputs`, and forward outputs' gradient variables into map `inputs`. - -4. Build the backward operator with `inputs`, `outputs` and the forward operator's attributes. - -### Backward Network Building - -A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and appending them together one by one. There are some corner cases that need special processing. - -1. Op - - When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in the no-gradient set, then return a special `NOP`. - -2. NetOp - - In our design, the network itself is also a kind of operator (**NetOp**). So the operators contained in a big network may themselves be small networks. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators' backward function recursively. During the process, we need to collect the `OutputGradients` names according to the forward NetOp. - -3. RnnOp - - RnnOp is a nested stepnet operator. The backward module needs to recursively call `Backward` for every stepnet. - -4. Sharing Variables - - As illustrated in figure 1 and figure 2, two operators share the same variable name **W@GRAD**, which will overwrite their shared input variable. - -(figure omitted) - -Figure 1. Sharing variables in operators. - -Sharing a variable between operators, or using the same input variable in multiple operators, can lead to duplicate gradient variables. As illustrated in figure 2, we need to rename the gradient names recursively and add a generic add operator to prevent overwriting. - -(figure omitted) - -Figure 2. Replace sharing variable's gradient with `Add` operator. -
- -Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent its position in the clockwise direction. - -5. Part of the Gradient is Zero. - - In the whole graph, there are cases where one operator's gradient is not needed, but its input's gradient is a dependency link of another operator; we need to fill a gradient matrix of the same shape at that position. In our implementation, we insert a special `fillZeroLike` operator. - - -Following the rules above, collect the sub-graph's `OutputGradients`/`InputGradients` as the NetOp's and return it. diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 2b858f5ea0874d7bf1a9cf38529f5d0d70cca7f2..72743b5fd0b32479ccbf28fbf98032df8fa371e9 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
#include "paddle/framework/backward.h" @@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker { using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad_op = new OpDescBind(); + std::unique_ptr Apply() const override { + auto grad_op = new OpDesc(); grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); grad_op->SetOutput(GradVarName("X"), InputGrad("X")); grad_op->SetOutput(GradVarName("b"), InputGrad("b")); grad_op->SetType("rowwise_add_grad"); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -106,15 +106,15 @@ class FcOp : public operators::NetOp { FcOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs) : NetOp(type, inputs, outputs, attrs) { - AppendOp(OpRegistry::CreateOp("mul", - {{"X", {Input("X")}}, {"Y", {Input("W")}}}, - {{"Out", {Output("mul_result")}}}, {})); + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); auto input_b = Inputs("b"); std::string before_act = "mul_result"; if (input_b.size() != 0) { AppendOp(OpRegistry::CreateOp( "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, - {{"Out", {Output("add_result")}}}, {})); + {{"Out", {Output("add_result")}}}, AttributeMap{})); before_act = "add_result"; } else { auto out_varname = Output("add_result"); @@ -124,7 +124,7 @@ class FcOp : public operators::NetOp { } AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, - {{"Out", {Output("Out")}}}, {})); + {{"Out", {Output("Out")}}}, AttributeMap{})); CompleteAddOp(false); } }; @@ -159,14 +159,14 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker { FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "x"); - AddOutput("Y", "out"); + AddOutput("Out", "out"); AddComment(""); } }; class SumOpMaker : public framework::OpProtoAndCheckerMaker { public: - SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input tensors of sum operator.").AsDuplicable(); AddOutput("Out", "the output tensor of sum operator."); @@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { - std::vector> retv; + std::vector> operator()() const override { + std::vector> retv; auto x_g = InputGrad("X"); if (!x_g.empty()) { - auto *op_desc = new OpDescBind(); + auto *op_desc = new OpDesc(); op_desc->SetType("scale"); op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetOutput("Out", x_g); @@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { auto y_g = InputGrad("Y"); if (!y_g.empty()) { - auto *op_desc = new OpDescBind(); + auto *op_desc = new OpDesc(); op_desc->SetType("scale"); op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetOutput("Out", y_g); @@ -278,8 +278,9 @@ REGISTER_OPERATOR(scale, f::NoneOp); REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); TEST(Backward, simple_op_not_need_grad) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); ASSERT_NE(fwd, nullptr); 
auto gop = f::Backward(*fwd, {"x"}); ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); @@ -296,9 +297,10 @@ TEST(Backward, net_fc_backward_normal) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_re"}}, {"Out", {"out"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -322,9 +324,10 @@ TEST(Backward, net_fc_backward_not_have_b) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_res"}}, {"Out", {"tmp"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -346,13 +349,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { {{"mul_result", {"mul_tmp_0"}}, {"add_result", {"add_tmp_0"}}, {"Out", {"hidden0"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_tmp_1"}}, {"add_result", {"add_tmp_1"}}, {"Out", {"hidden1"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto bwd = Backward(net, {"x"}); // x@GRAD is not need. ASSERT_TRUE(bwd->IsNetOp()); @@ -381,12 +384,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { TEST(Backward, net_shared_weight) { ops::NetOp net; net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, - {{"Out", {"out"}}}, {})); + {{"Out", {"out"}}}, f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, - {{"Out", {"FinalOut"}}}, {})); + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); net.CompleteAddOp(); - auto bwd = f::Backward(net, {}); + auto bwd = f::Backward(net, std::unordered_set{}); ASSERT_TRUE(bwd->IsNetOp()); auto bwd_net = static_cast(bwd.get()); ASSERT_EQ(3UL, bwd_net->ops_.size()); @@ -394,8 +398,9 @@ TEST(Backward, net_shared_weight) { } TEST(Backward, op_all_input_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"x", "b"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -403,8 +408,9 @@ TEST(Backward, op_all_input_are_not_need) { } TEST(Backward, op_all_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"out"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -412,8 +418,9 @@ TEST(Backward, op_all_output_are_not_need) { } TEST(Backward, op_part_of_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, - {{"y", {"Y"}}, {"z", {"Z"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"Z"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -423,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) { ASSERT_EQ("fill_zeros_like", fill_zero.Type()); ASSERT_EQ(1UL, 
fill_zero.Inputs("X").size()); ASSERT_EQ("Z", fill_zero.Input("X")); - ASSERT_EQ(1UL, fill_zero.Outputs("Y").size()); - ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y")); + ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); auto &d_many_out = *net->ops_[1]; ASSERT_EQ("many_output_op_grad", d_many_out.Type()); @@ -437,7 +444,7 @@ TEST(Backward, op_part_of_output_are_not_need) { TEST(Backward, op_part_of_input_are_not_need) { auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, - {{"Out", {"out"}}}, {}); + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"a"}); auto &grad_mul = *backward; ASSERT_EQ(grad_mul.Type(), "mul_grad"); @@ -458,19 +465,19 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { {{"mul_result", {"mul_out1"}}, {"add_result", {"add_out1"}}, {"Out", {"out1"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_out2"}}, {"add_result", {"tmp_out2"}}, {"Out", {"out2"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, {{"mul_result", {"mul_out3"}}, {"add_result", {"tmp_out3"}}, {"Out", {"out3"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); @@ -498,24 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { } TEST(Backward, simple_single_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - f::OpDescBind *op = block->AppendOp(); + f::OpDesc *op = block->AppendOp(); op->SetType("rowwise_add"); op->SetInput("X", {"x"}); op->SetInput("b", {"b"}); op->SetOutput("Out", {"out"}); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); - f::OpDescBind *fill_op = block->AllOps()[1]; + f::OpDesc *fill_op = block->AllOps()[1]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op = block->AllOps()[2]; + f::OpDesc *grad_op = block->AllOps()[2]; EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op->InputNames().size(), 1UL); ASSERT_EQ(grad_op->OutputNames().size(), 2UL); @@ -535,63 +543,64 @@ TEST(Backward, simple_single_op) { } TEST(Backward, default_attribute) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op = block->AppendOp(); op->SetType("mul"); op->SetInput("X", {"x"}); op->SetInput("Y", {"y"}); op->SetOutput("Out", {"out"}); op->CheckAttrs(); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); - AppendBackward(program, target, {}); + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); - f::OpDescBind *fill_op = block->AllOps()[1]; + f::OpDesc *fill_op = block->AllOps()[1]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op = 
block->AllOps()[2]; + f::OpDesc *grad_op = block->AllOps()[2]; ASSERT_EQ(grad_op->Type(), "mul_grad"); EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); } TEST(Backward, simple_mult_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"out1"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out2"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - auto target = f::VarDescBind("out3"); + auto target = f::VarDesc("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 6UL + 1); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDesc *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -602,7 +611,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op2 = block->AllOps()[5]; + f::OpDesc *grad_op2 = block->AllOps()[5]; EXPECT_EQ(grad_op2->Type(), "mul_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 4UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -616,7 +625,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *grad_op3 = block->AllOps()[4]; + f::OpDesc *grad_op3 = block->AllOps()[4]; EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -646,42 +655,42 @@ TEST(Backward, simple_mult_op) { } TEST(Backward, intermedia_var_no_grad) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"x2"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out2"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - f::OpDescBind *op4 = block->AppendOp(); + f::OpDesc *op4 = block->AppendOp(); op4->SetType("mul"); op4->SetInput("X", {"out1"}); op4->SetInput("Y", {"out3"}); op4->SetOutput("Out", {"out4"}); - auto target = f::VarDescBind("out4"); + auto 
target = f::VarDesc("out4"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"out3"}); ASSERT_EQ(block->AllOps().size(), 7UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDesc *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -692,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op4 = block->AllOps()[5]; + f::OpDesc *grad_op4 = block->AllOps()[5]; EXPECT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -717,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) { } TEST(Backward, var_no_grad) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("mult_in_out"); op1->SetInput("X", {"x1"}); op1->SetInput("H", {"h1"}); op1->SetOutput("Y", {"y1"}); op1->SetOutput("Z", {"z1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mult_in_out"); op2->SetInput("X", {"y1"}); op2->SetInput("H", {"z1"}); op2->SetOutput("Y", {"y2"}); op2->SetOutput("Z", {"z2"}); - auto target = f::VarDescBind("z2"); + auto target = f::VarDesc("z2"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"z1"}); ASSERT_EQ(block->AllOps().size(), 6UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op2 = block->AllOps()[3]; + f::OpDesc *grad_op2 = block->AllOps()[3]; ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 6UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -758,15 +767,15 @@ TEST(Backward, var_no_grad) { std::vector({f::GradVarName("y1")})); EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); - f::OpDescBind *fill_zero_op = block->AllOps()[4]; + f::OpDesc *fill_zero_op = block->AllOps()[4]; ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); - EXPECT_EQ(fill_zero_op->Output("Y"), + EXPECT_EQ(fill_zero_op->Output("Out"), std::vector({std::string("z1") + f::kZeroVarSuffix})); - f::OpDescBind *grad_op1 = block->AllOps()[5]; + f::OpDesc *grad_op1 = block->AllOps()[5]; ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 6UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -794,36 +803,37 @@ TEST(Backward, var_no_grad) { } TEST(Backward, shared_var) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); 
op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"out1"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out1"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - auto target = f::VarDescBind("out3"); + auto target = f::VarDesc("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 8UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op3 = block->AllOps()[4]; + f::OpDesc *grad_op3 = block->AllOps()[4]; ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -834,7 +844,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), std::vector({f::GradVarName("b3")})); - f::OpDescBind *grad_op4 = block->AllOps()[5]; + f::OpDesc *grad_op4 = block->AllOps()[5]; ASSERT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -848,7 +858,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *sum_op = block->AllOps()[6]; + f::OpDesc *sum_op = block->AllOps()[6]; ASSERT_EQ(sum_op->Type(), "sum"); ASSERT_EQ(sum_op->InputNames().size(), 1UL); ASSERT_EQ(sum_op->OutputNames().size(), 1UL); @@ -858,7 +868,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(sum_op->Output("Out"), std::vector({f::GradVarName("out1")})); - f::OpDescBind *grad_op1 = block->AllOps()[7]; + f::OpDesc *grad_op1 = block->AllOps()[7]; ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -885,19 +895,19 @@ TEST(Backward, shared_var) { } TEST(Backward, half_backward) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); auto *op1 = block->AppendOp(); op1->SetType("minus"); op1->SetInput("X", {"a"}); op1->SetInput("Y", {"b"}); op1->SetOutput("Out", {"out"}); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"b"}); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); auto ops = block->AllOps(); ASSERT_EQ(3UL, ops.size()); diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 11764810e1d40e5e6eb3cd0d8e9b4b63a79855b4..54498e175dacfa0a220e3d839f4feb02502b2c03 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -13,23 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/block_desc.h" +#include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" namespace paddle { namespace framework { -VarDescBind *BlockDescBind::Var(const std::string &name) { +VarDesc *BlockDesc::Var(const std::string &name) { auto it = vars_.find(name); if (it != vars_.end()) { return it->second.get(); } need_update_ = true; - auto *var = new VarDescBind(name); + auto *var = new VarDesc(name); vars_[name].reset(var); return var; } -VarDescBind *BlockDescBind::FindVar(const std::string &name) const { +VarDesc *BlockDesc::FindVar(const std::string &name) const { auto it = vars_.find(name); if (it == vars_.end()) { return nullptr; @@ -37,11 +38,13 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const { return it->second.get(); } -bool BlockDescBind::HasVar(const std::string &name) const { +bool BlockDesc::HasVar(const std::string &name) const { return vars_.find(name) != vars_.end(); } -VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { + if (name == kEmptyVarName) return nullptr; + auto it = vars_.find(name); if (it == vars_.end()) { return Parent() == kNoneBlockIndex ? nullptr @@ -50,53 +53,67 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { return it->second.get(); } -VarDescBind *BlockDescBind::FindRecursiveOrCreateVar( - const std::string &name_bytes) { - VarDescBind *res = FindVarRecursive(name_bytes); +VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); if (res == nullptr) { res = Var(name_bytes); } - return res; + return *res; } -bool BlockDescBind::HasVarRecursive(const std::string &name) const { +bool BlockDesc::HasVarRecursive(const std::string &name) const { return FindVarRecursive(name) != nullptr; } -std::vector BlockDescBind::AllVars() const { - std::vector res; +std::vector BlockDesc::AllVars() const { + std::vector res; for (const auto &p : vars_) { res.push_back(p.second.get()); } return res; } -OpDescBind *BlockDescBind::AppendOp() { +OpDesc *BlockDesc::AppendOp() { need_update_ = true; - ops_.emplace_back(new OpDescBind()); + ops_.emplace_back(new OpDesc()); return ops_.back().get(); } -void BlockDescBind::AppendAllocatedOp(std::unique_ptr &&op_desc) { +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { need_update_ = true; ops_.emplace_back(std::move(op_desc)); } -OpDescBind *BlockDescBind::PrependOp() { +OpDesc *BlockDesc::PrependOp() { need_update_ = true; - ops_.emplace_front(new OpDescBind()); + ops_.emplace_front(new OpDesc()); return ops_.front().get(); } -std::vector BlockDescBind::AllOps() const { - std::vector res; +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. 
+ VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +std::vector BlockDesc::AllOps() const { + std::vector res; for (const auto &op : ops_) { res.push_back(op.get()); } return res; } -void BlockDescBind::Flush() { +void BlockDesc::Flush() { for (auto &op_desc : ops_) { op_desc->Flush(); } @@ -118,43 +135,43 @@ void BlockDescBind::Flush() { } } -BlockDescBind *BlockDescBind::ParentBlock() const { +BlockDesc *BlockDesc::ParentBlock() const { if (this->desc_->parent_idx() == kNoneBlockIndex) { return nullptr; } return prog_->MutableBlock(static_cast(this->desc_->parent_idx())); } -BlockDesc *BlockDescBind::Proto() { +proto::BlockDesc *BlockDesc::Proto() { Flush(); return desc_; } -BlockDescBind::BlockDescBind(ProgramDescBind *prog, BlockDesc *desc) +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) : prog_(prog), desc_(desc), need_update_(false) { - for (const VarDesc &var_desc : desc_->vars()) { - vars_[var_desc.name()].reset(new VarDescBind(var_desc)); + for (const proto::VarDesc &var_desc : desc_->vars()) { + vars_[var_desc.name()].reset(new VarDesc(var_desc)); } - for (const OpDesc &op_desc : desc_->ops()) { - ops_.emplace_back(new OpDescBind(op_desc, prog)); + for (const proto::OpDesc &op_desc : desc_->ops()) { + ops_.emplace_back(new OpDesc(op_desc, prog)); } } -BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc, - ProgramDescBind *prog) +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDescBind(*op)); + ops_.emplace_back(new OpDesc(*op)); } for (auto &it : other.vars_) { - auto *var = new VarDescBind(*it.second); + auto *var = new VarDesc(*it.second); vars_[it.first].reset(var); } } -void BlockDescBind::ClearPBOps() { +void BlockDesc::ClearPBOps() { auto ops = this->desc_->mutable_ops(); while (!ops->empty()) { // we do not own the OpDesc, so release the ownership. @@ -162,7 +179,7 @@ void BlockDescBind::ClearPBOps() { } } -void BlockDescBind::ClearPBVars() { +void BlockDesc::ClearPBVars() { auto vars = this->desc_->mutable_vars(); while (!vars->empty()) { // we do not own the VarDesc, so release the ownership. diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 8e967e5378eb47a7869efb59cc96a271f1cbb9a1..4b609e4bcb67bb8dda5924a639e7a8165eda4353 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -28,20 +28,19 @@ limitations under the License. */ namespace paddle { namespace framework { -class ProgramDescBind; +class ProgramDesc; // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize // read/write speed. Only when we want the protobuf message, the local changes // will be synchronized (by `Sync` method). 
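As background for the comment above: these wrapper classes buffer edits in C++ containers and serialize them into the wrapped protobuf message only on demand; in this file the synchronization point is `Flush()`, which `Proto()` invokes before returning the raw `proto::BlockDesc`. A minimal sketch of that access pattern, using the classes as renamed by this patch (the `BuildAndSerialize` helper is illustrative, not part of the codebase):

```cpp
// Sketch: mutate a block through the C++ wrappers, then materialize the proto.
#include "paddle/framework/block_desc.h"
#include "paddle/framework/program_desc.h"

namespace f = paddle::framework;

void BuildAndSerialize() {
  f::ProgramDesc program;
  f::BlockDesc* block = program.MutableBlock(0);

  f::OpDesc* op = block->AppendOp();  // buffered in ops_, not yet in the proto
  op->SetType("mul");
  op->SetInput("X", {"x"});
  op->SetInput("Y", {"y"});
  op->SetOutput("Out", {"out"});

  // Proto() calls Flush(), which writes the buffered ops and vars into the
  // wrapped proto::BlockDesc and only then returns it.
  f::proto::BlockDesc* raw = block->Proto();
  (void)raw;  // serialize or inspect the protobuf here
}
```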
-class BlockDescBind { +class BlockDesc { public: - BlockDescBind(ProgramDescBind *prog, BlockDesc *desc); + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); - BlockDescBind(const BlockDescBind &other, BlockDesc *desc, - ProgramDescBind *prog); + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); - ~BlockDescBind() { + ~BlockDesc() { this->ClearPBVars(); this->ClearPBOps(); } @@ -50,15 +49,15 @@ class BlockDescBind { int32_t Parent() const { return desc_->parent_idx(); } - VarDescBind *Var(const std::string &name_bytes); + VarDesc *Var(const std::string &name_bytes); - VarDescBind *FindVar(const std::string &name_bytes) const; + VarDesc *FindVar(const std::string &name_bytes) const; bool HasVar(const std::string &var_name) const; - VarDescBind *FindVarRecursive(const std::string &name_bytes) const; + VarDesc *FindVarRecursive(const std::string &name_bytes) const; - VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes); + VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes); bool HasVarRecursive(const std::string &var_name) const; @@ -70,41 +69,43 @@ class BlockDescBind { return var_names; } - std::vector AllVars() const; + std::vector AllVars() const; - BlockDescBind *ParentBlock() const; + BlockDesc *ParentBlock() const; - OpDescBind *AppendOp(); + OpDesc *AppendOp(); - void AppendAllocatedOp(std::unique_ptr &&op_desc); + void AppendAllocatedOp(std::unique_ptr &&op_desc); - OpDescBind *PrependOp(); + OpDesc *PrependOp(); - std::vector AllOps() const; + void RemoveOp(size_t s, size_t e); + + std::vector AllOps() const; size_t OpSize() const { return ops_.size(); } - OpDescBind *Op(int idx) { return ops_.at(idx).get(); } + OpDesc *Op(int idx) { return ops_.at(idx).get(); } void Flush(); - BlockDesc *Proto(); + proto::BlockDesc *Proto(); - ProgramDescBind *Program() { return this->prog_; } + ProgramDesc *Program() { return this->prog_; } private: void ClearPBOps(); void ClearPBVars(); private: - ProgramDescBind *prog_; // not_own - BlockDesc *desc_; // not_own + ProgramDesc *prog_; // not_own + proto::BlockDesc *desc_; // not_own bool need_update_; - std::deque> ops_; - std::unordered_map> vars_; + std::deque> ops_; + std::unordered_map> vars_; - DISABLE_COPY_AND_ASSIGN(BlockDescBind); + DISABLE_COPY_AND_ASSIGN(BlockDesc); }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/data_device_transform.cc b/paddle/framework/data_device_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..d38d87927fc7ee0aa32eabff5cf83d7c4ca7b2b0 --- /dev/null +++ b/paddle/framework/data_device_transform.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +static const platform::DeviceContext* GetDeviceContext( + const platform::Place& src_place, const platform::Place& dst_place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { + return pool.Get(src_place); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + return pool.Get(dst_place); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + +void DeviceTransform(const Tensor& in, const platform::Place& dst_place, + Tensor* out) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + auto* dev_ctx = GetDeviceContext(in.place(), dst_place); + dev_ctx->Wait(); + Copy(in, dst_place, *dev_ctx, out); + dev_ctx->Wait(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_device_transform.h b/paddle/framework/data_device_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..b21ed0be34ad868ae4a404e913e9623db308b530 --- /dev/null +++ b/paddle/framework/data_device_transform.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +void DeviceTransform(const Tensor& in, const platform::Place& dst_place, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_device_transform_test.cu b/paddle/framework/data_device_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..5d89f5546fa87241dec6364d86a100ca51bce687 --- /dev/null +++ b/paddle/framework/data_device_transform_test.cu @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" + +#include "paddle/framework/init.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_info.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op_function.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::DataType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + using namespace paddle::framework; + using namespace paddle::platform; + InitDevices(); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an op to run on GPU + 
paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + + // auto* output2_ptr = output2->Get().data(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + Copy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..31817251ed09a7a1da7223c1e99b2eb369a3de30 --- /dev/null +++ b/paddle/framework/data_layout.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace framework { + +enum class DataLayout { + kNHWC = 0, + kNCHW = 1, + kAnyLayout = 2, +}; + +inline DataLayout StringToDataLayout(const std::string& str) { + std::string s(str); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + + if (s == "NHWC") { + return DataLayout::kNHWC; + } else if (s == "NCHW") { + return DataLayout::kNCHW; + } else if (s == "ANYLAYOUT") { + return DataLayout::kAnyLayout; + } else { + PADDLE_THROW("Unknown storage order string: %s", s); + } +} + +inline std::string DataLayoutToString(const DataLayout& data_layout) { + switch (data_layout) { + case DataLayout::kNHWC: + return "NHWC"; + case DataLayout::kNCHW: + return "NCHW"; + case DataLayout::kAnyLayout: + return "ANY_LAYOUT"; + default: + PADDLE_THROW("unknown DataLayou %d", data_layout); + } +} + +inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) { + out << DataLayoutToString(l); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_layout_transform.cc b/paddle/framework/data_layout_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..96794cae97d460e86fe83ac1395e1dfc7e371e3b --- /dev/null +++ b/paddle/framework/data_layout_transform.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/data_layout_transform.h" + +#include "paddle/framework/tensor.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace framework { + +struct CastDataLayout { + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataLayout(const std::vector& axis, + const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataLayout only support DataLayout transform on same place!"); + PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_, + "TransDataLayout only support Datatype are same!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); + + auto src_dim = src.dims(); + std::vector dst_dim; + + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + dst->Resize(make_ddim(dst_dim)); + auto place = kernel_pair.second.place_; + dst->mutable_data(place, src.type()); + + auto src_type = kernel_pair.first.data_type_; + framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst)); + + dst->set_layout(kernel_pair.second.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_layout_transform.h b/paddle/framework/data_layout_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..befae1f63616a4c21d998c6b784b8ef288d00617 --- /dev/null +++ b/paddle/framework/data_layout_transform.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/variable.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataLayout(const std::vector& axis, + const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..d826f0edace6d5afee5cd83f6e65d6dbaefae874 --- /dev/null +++ b/paddle/framework/data_transform.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/data_transform.h" + +#include "paddle/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out) { + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + DeviceTransform(input_tensor, expected_kernel_type.place_, out); + } + PADDLE_ENFORCE_NOT_NULL(out, "out should not be null"); +} + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var) { + if (in_var.IsType()) { + auto& in_lod_tensor = in_var.Get(); + auto* tran_lod_tensor = out_var.GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto& in_selected_rows = in_var.Get(); + auto* trans_selected_rows = out_var.GetMutable(); + trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..a4b78902379d5eb89f92ec47655ff93b49d0bfab --- /dev/null +++ b/paddle/framework/data_transform.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/tensor.h" +#include "paddle/framework/variable.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/macros.h" +#include "paddle/platform/transform.h" + +namespace paddle { +namespace framework { + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out); + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index c54d2d4ddf09c445fb25c1fbe8a7498f233d8212..6a372ac32e48131eed28e2d42125feb5b92a11c7 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include @@ -20,7 +20,8 @@ namespace paddle { namespace framework { -inline DataType ToDataType(std::type_index type) { +inline proto::DataType ToDataType(std::type_index type) { + using namespace paddle::framework::proto; if (typeid(float).hash_code() == type.hash_code()) { return DataType::FP32; } else if (typeid(double).hash_code() == type.hash_code()) { @@ -36,7 +37,8 @@ inline DataType ToDataType(std::type_index type) { } } -inline std::type_index ToTypeIndex(DataType type) { +inline std::type_index ToTypeIndex(proto::DataType type) { + using namespace paddle::framework::proto; switch (type) { case DataType::FP32: return typeid(float); @@ -54,7 +56,8 @@ inline std::type_index ToTypeIndex(DataType type) { } template -inline void VisitDataType(DataType type, Visitor visitor) { +inline void VisitDataType(proto::DataType type, Visitor visitor) { + using namespace paddle::framework::proto; switch (type) { case DataType::FP32: visitor.template operator()(); diff --git a/paddle/framework/data_type_transform.cc b/paddle/framework/data_type_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..63373232e910d44eb0996f9280f9c166ad092030 --- /dev/null +++ b/paddle/framework/data_type_transform.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/data_type_transform.h" + +#include "paddle/framework/selected_rows.h" +#include "paddle/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + auto* in_begin = in_.data(); + auto numel = in_.numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(place); + + if (platform::is_cpu_place(place)) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataType(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + + auto dims = src.dims(); + dst->Resize(dims); + auto dst_type = kernel_pair.second.data_type_; + auto src_type = kernel_pair.first.data_type_; + + switch (src_type) { + case proto::DataType::FP32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::FP64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::BOOL: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_type_transform.h b/paddle/framework/data_type_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..8ec90742256c2308a242d993838e46e51a6fc167 --- /dev/null +++ b/paddle/framework/data_type_transform.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/variable.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataType(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 756232b1b56a49d2c91cc2cac950ca508c54fb3f..bc259d1f603fb34ac8546c388669d8c5c1250bd1 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -1,3 +1,16 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h new file mode 100644 index 0000000000000000000000000000000000000000..7e308ffb5a49876aa2c1833b3b7e2a2c7eb137aa --- /dev/null +++ b/paddle/framework/details/cow_ptr.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +// Change it to thread safe flags if needed. +class ThreadUnsafeOwnershipFlags { + public: + ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} + + ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags& operator=( + const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; + + void SetOwnership(bool flag) { flag_ = flag; } + + // Invoke the callback if it is not owned. + template + void AcquireOwnershipOnce(Callback acquire) { + if (!flag_) { + acquire(); + flag_ = true; + } + } + + private: + bool flag_; +}; + +// Copy-On-Write pointer. +// It will hold a T* pointer, and only copy once when `MutableData` is invoked. +// +// The template parameter OwnershipFlags should have: +// * a constructor takes a bool. True if own. 
+// * SetOwnership(bool flag). +// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not +// owned. +// +// https://en.wikipedia.org/wiki/Copy-on-write +template +class COWPtr { + public: + // Ctor from raw pointer. + explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} + + // Move methods. Steal ownership from origin + COWPtr(COWPtr&& other) + : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} + COWPtr& operator=(COWPtr&& origin) = default; + + // Copy methods. Not own payload + COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} + COWPtr& operator=(const COWPtr& other) { + payload_ = other.payload_; + ownership_.SetOwnership(false); + return *this; + } + + // Access read only data. + const T& Data() const { return *payload_; } + + // Access mutable data. If the data is not owned, the data will be copied + // before. + T* MutableData() { + ownership_.AcquireOwnershipOnce( + [this] { payload_.reset(new T(*payload_)); }); + return payload_.get(); + } + + private: + // Actual data pointer. + std::shared_ptr payload_; + + // Ownership flag. + OwnershipFlags ownership_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/operators/seq_expand_op.cu b/paddle/framework/details/cow_ptr_test.cc similarity index 50% rename from paddle/operators/seq_expand_op.cu rename to paddle/framework/details/cow_ptr_test.cc index f1e4b82a76e628c4d9fb83bc93f3dcfd2f98ea5b..936954a2333e7e5d2a932abad641279db9ef7b9f 100644 --- a/paddle/operators/seq_expand_op.cu +++ b/paddle/framework/details/cow_ptr_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,12 +12,24 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU -#include "paddle/operators/seq_expand_op.h" +#include "paddle/framework/details/cow_ptr.h" +#include "gtest/gtest.h" -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(seq_expand, - ops::SeqExpandKernel); -REGISTER_OP_GPU_KERNEL( - seq_expand_grad, - ops::SeqExpandGradKernel); +namespace paddle { +namespace framework { +namespace details { + +TEST(COWPtr, all) { + COWPtr ptr(new int{0}); + ASSERT_EQ(ptr.Data(), 0); + COWPtr ptr2 = ptr; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(&ptr2.Data(), &ptr.Data()); + *ptr2.MutableData() = 10; + ASSERT_EQ(ptr.Data(), 0); + ASSERT_EQ(ptr2.Data(), 10); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h index f91e0e03410c95f84a65f02beed38b7bbfdcaa86..6d50e820b2b625f932768d2ca671d999071f1ca6 100644 --- a/paddle/framework/details/op_registry.h +++ b/paddle/framework/details/op_registry.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -90,7 +90,7 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->proto_ = new OpProto; + info->proto_ = new proto::OpProto; info->checker_ = new OpAttrChecker(); auto maker = T(info->proto_, info->checker_); maker.Validate(); @@ -106,10 +106,10 @@ template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { info->grad_op_maker_ = []( - const OpDescBind& fwd_op, + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var, - const std::vector& grad_block) { + const std::vector& grad_block) { T maker(fwd_op, no_grad_set, grad_to_var, grad_block); return maker(); }; @@ -119,7 +119,7 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) { + info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { T inference; inference(fwd_op, block); }; diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h index 04d4b0e604e6f73ad94e0ca79d6b69f663bd4076..ec17d7c6156351d21a4f9431f85fb0bcf00e4331 100644 --- a/paddle/framework/dim.h +++ b/paddle/framework/dim.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once #include diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 0a6a87669c900de6cb507dd48f0cfc871defe279..2bcab7c5c2e454e86a148fde003164d369c46ef2 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. #include #include diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc index bc4a2db32cfba66bef2c444e1f822e0d2a57b91e..c6f77ecfabdddbfdd1df646a1c9310c03930bd8e 100644 --- a/paddle/framework/eigen_test.cc +++ b/paddle/framework/eigen_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index adedd8cb0e8504fd6fc924e62a2ede3c1c7ce698..c0418c9266e257bd7567861543e557f354451b17 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -14,18 +14,18 @@ limitations under the License. */ #include "paddle/framework/executor.h" -#include -#include -#include #include -#include +#include "gflags/gflags.h" #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/lod_rank_table.h" -#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" -#include "paddle/framework/scope.h" +#include "paddle/platform/place.h" + +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. 
It will be " + "extremely slow so please use this flag wisely."); namespace paddle { namespace framework { @@ -33,103 +33,104 @@ namespace framework { const std::string kFeedOpType = "feed"; const std::string kFetchOpType = "fetch"; -Executor::Executor(const std::vector& places) : own_(true) { - PADDLE_ENFORCE_GT(places.size(), 0); - device_contexts_.resize(places.size()); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { - device_contexts_[i] = new platform::CPUDeviceContext( - boost::get(places[i])); - } else if (platform::is_gpu_place(places[i])) { -#ifdef PADDLE_WITH_CUDA - device_contexts_[i] = new platform::CUDADeviceContext( - boost::get(places[i])); -#else - PADDLE_THROW( - "'GPUPlace' is not supported, Please re-compile with WITH_GPU " - "option"); -#endif - } - } -} +Executor::Executor(const platform::Place& place) : place_(place) {} -Executor::~Executor() { - if (own_) { - for (auto& device_context : device_contexts_) { - delete device_context; - } - } -} - -static void CreateTensor(Variable* var, VarDesc::VarType var_type) { - if (var_type == VarDesc::LOD_TENSOR) { +static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { + if (var_type == proto::VarDesc::LOD_TENSOR) { var->GetMutable(); - } else if (var_type == VarDesc::SELECTED_ROWS) { + } else if (var_type == proto::VarDesc::SELECTED_ROWS) { var->GetMutable(); - } else if (var_type == VarDesc::FEED_MINIBATCH) { + } else if (var_type == proto::VarDesc::FEED_MINIBATCH) { var->GetMutable(); - } else if (var_type == VarDesc::FETCH_LIST) { + } else if (var_type == proto::VarDesc::FETCH_LIST) { var->GetMutable(); - } else if (var_type == VarDesc::STEP_SCOPES) { + } else if (var_type == proto::VarDesc::STEP_SCOPES) { var->GetMutable>(); - } else if (var_type == VarDesc::LOD_RANK_TABLE) { + } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) { var->GetMutable(); - } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) { + } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarDesc::PLACE_LIST) { + var->GetMutable(); } else { PADDLE_THROW( "Variable type %d is not in " - "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]", + "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE," + " PLACE_LIST]", var_type); } } -void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, - bool create_local_scope) { +static void CheckTensorNANOrInf(const std::string& name, + const framework::Tensor& tensor) { + if (tensor.memory_size() == 0) { + return; + } + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { + return; + } + PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); +} + +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars) { // TODO(tonyyang-svail): // - only runs on the first device (i.e. 
no interdevice communication) // - will change to use multiple blocks for RNN op and Cond Op PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); auto& block = pdesc.Block(block_id); - auto& device = device_contexts_[0]; Scope* local_scope = scope; - if (create_local_scope) { - local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : block.AllVars()) { auto* ptr = local_scope->Var(var->Name()); CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } + } // if (create_local_scope) + } // if (create_vars) for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(10) << op->DebugString(); - op->Run(*local_scope, *device); + VLOG(3) << op->DebugStringEx(local_scope); + op->Run(*local_scope, place_); + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } } - if (create_local_scope) { + if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); } } -Executor::Executor(const platform::DeviceContext& device) - : device_contexts_({&device}), own_(false) {} - } // namespace framework } // namespace paddle diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index b745f4f6474ef688774f4c833a3958942e9aa8cb..d869e18901b82959a40cc296aa0844c20ea63ac1 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -18,15 +18,18 @@ limitations under the License. 
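As a hedged usage sketch of the reworked executor, based only on the constructor and Run signature introduced above (the ProgramDesc is assumed to be already built elsewhere):

#include "paddle/framework/executor.h"
#include "paddle/framework/scope.h"

void RunProgramSketch(const paddle::framework::ProgramDesc& prog) {
  paddle::platform::CPUPlace place;        // one place, no context vector
  paddle::framework::Executor exe(place);
  paddle::framework::Scope scope;
  // Run block 0 and let the executor create the local scope and variables.
  exe.Run(prog, &scope, 0, /*create_local_scope=*/true, /*create_vars=*/true);
}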
*/ #include "paddle/framework/program_desc.h" #include "paddle/framework/scope.h" #include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { class Executor { public: - explicit Executor(const std::vector& places); - explicit Executor(const platform::DeviceContext& devices); - ~Executor(); + // TODO(dzhwinter) : Do not rely on this function, it will be removed + explicit Executor(const platform::DeviceContext& device) + : Executor(device.GetPlace()) {} + + explicit Executor(const platform::Place& place); /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope @@ -35,11 +38,11 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true); + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); private: - std::vector device_contexts_; - bool own_; + const platform::Place place_; }; } // namespace framework diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h index bc4ae440fc708f696c18bb9d5ab3ba7dd59e21ab..9bc4a90c44828ecb7458d524f59609f01848cc5c 100644 --- a/paddle/framework/feed_fetch_type.h +++ b/paddle/framework/feed_fetch_type.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index f1fc4529e15502927560eefd74110f6ca7eab4a9..ea69b87e2ac7dc587333b623c310182bb39eb452 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -14,7 +14,7 @@ limitations under the License. */ syntax = "proto2"; option optimize_for = LITE_RUNTIME; -package paddle.framework; +package paddle.framework.proto; enum AttrType { INT = 0; @@ -123,6 +123,7 @@ message VarDesc { STEP_SCOPES = 5; LOD_RANK_TABLE = 6; LOD_TENSOR_ARRAY = 7; + PLACE_LIST = 8; } required string name = 1; required VarType type = 2; diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index 998186e33915a11f2864eb5387d19ed1bfbab51c..2082f8bb76fb62bc36f033fecbd4eaa76d12d949 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include @@ -22,21 +22,27 @@ namespace paddle { namespace framework { +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient variable will be ignored or set to kEmptyVarName, depending on the + template argument DropEmptyIG in the derived classes. + */ class GradOpDescMakerBase { public: explicit GradOpDescMakerBase( - const OpDescBind& fwd_op, - const std::unordered_set<std::string>& no_grad_set, + const OpDesc& fwd_op, const std::unordered_set<std::string>& no_grad_set, std::unordered_map<std::string, std::string>* grad_to_var, - const std::vector<BlockDescBind*>& grad_block = - std::vector<BlockDescBind*>()) + const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) : fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var), grad_block_(grad_block) {} virtual ~GradOpDescMakerBase() = default; - virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0; + virtual std::vector<std::unique_ptr<OpDesc>> operator()() const = 0; protected: std::vector<std::string> InputGrad(const std::string& name, @@ -58,6 +64,16 @@ class GradOpDescMakerBase { if (!drop_empty_grad) { return ret_val; } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence between a variable and its gradient" + " ambiguous. Use REGISTER_OP_EX to register the op" + " or call InputGrad(?,false) in GradOpDescMaker."
+ " Op type %s", + fwd_op_.Type()); + std::vector dropped_ret_val; dropped_ret_val.reserve(ret_val.size()); std::copy_if(ret_val.begin(), ret_val.end(), @@ -71,7 +87,11 @@ class GradOpDescMakerBase { auto onames = this->Output(name); ret_val.reserve(onames.size()); std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), - GradVarName); + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + }); return ret_val; } @@ -105,26 +125,26 @@ class GradOpDescMakerBase { std::string ForwardOpType() const { return this->fwd_op_.Type(); } private: - const OpDescBind& fwd_op_; + const OpDesc& fwd_op_; const std::unordered_set& no_grad_set_; std::unordered_map* grad_to_var_; protected: - std::vector grad_block_; + std::vector grad_block_; }; class SingleGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const { - std::vector> retv; + std::vector> operator()() const { + std::vector> retv; retv.emplace_back(this->Apply()); return retv; } protected: - virtual std::unique_ptr Apply() const = 0; + virtual std::unique_ptr Apply() const = 0; }; template @@ -133,8 +153,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto* grad = new OpDescBind(); + virtual std::unique_ptr Apply() const { + auto* grad = new OpDesc(); grad->SetType(this->GradOpType()); for (auto& input_param : this->InputNames()) { @@ -150,7 +170,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { grad->SetAttrMap(this->Attrs()); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } virtual std::string GradOpType() const { @@ -161,7 +181,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { class EmptyGradOpMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { + std::vector> operator()() const override { return {}; } }; diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ef82a541efaa35bcf831d5122570154f2fa2423 --- /dev/null +++ b/paddle/framework/init.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include <string.h> // for strdup +#include +#include + +#include "paddle/framework/init.h" +#include "paddle/framework/operator.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" +#include "paddle/string/piece.h" + +namespace paddle { +namespace framework { + +std::once_flag gflags_init_flag; + +void InitGflags(std::vector<std::string> &argv) { + std::call_once(gflags_init_flag, [&]() { + int argc = argv.size(); + char **arr = new char *[argv.size()]; + std::string line; + for (size_t i = 0; i < argv.size(); i++) { + arr[i] = &argv[i][0]; + line += argv[i]; + line += ' '; + } + google::ParseCommandLineFlags(&argc, &arr, true); + VLOG(1) << "Init commandline: " << line; + }); +} + +void InitDevices() { + /* Init all available devices by default */ + + std::vector<platform::Place> places; + places.emplace_back(platform::CPUPlace()); + +#ifdef PADDLE_WITH_CUDA + int count = platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(platform::CUDAPlace(i)); + } +#else + LOG(WARNING) + << "'GPU' is not supported, please re-compile with WITH_GPU option"; +#endif + + platform::DeviceContextPool::Init(places); +} + +void InitGLOG(const std::string &prog_name) { + // glog will not hold the ARGV[0] inside. + // Use strdup to alloc a new string. + google::InitGoogleLogging(strdup(prog_name.c_str())); + google::InstallFailureSignalHandler(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/init.h b/paddle/framework/init.h new file mode 100644 index 0000000000000000000000000000000000000000..c8fd964d006baf729888414ded2aec85ba5a024e --- /dev/null +++ b/paddle/framework/init.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" + +namespace paddle { +namespace framework { + +void InitGflags(std::vector<std::string> &argv); + +void InitGLOG(const std::string &prog_name); + +void InitDevices(); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f837a965d3be7d40c20803ae4462b3bfd91bffd0 --- /dev/null +++ b/paddle/framework/init_test.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
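A sketch of the intended one-time start-up sequence for the three helpers above; the flag values are illustrative only:

#include <string>
#include <vector>
#include "paddle/framework/init.h"

int main() {
  // InitGflags takes a mutable vector<std::string>; entry 0 mimics argv[0].
  std::vector<std::string> flags = {"my_prog", "--check_nan_inf=false"};
  paddle::framework::InitGflags(flags);
  paddle::framework::InitGLOG("my_prog");  // also installs the failure handler
  paddle::framework::InitDevices();        // CPU always; GPUs when compiled in
  return 0;
}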
*/ +#include "gtest/gtest.h" + +#include "paddle/framework/init.h" +#include "paddle/platform/device_context.h" + +TEST(InitDevices, CPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_GE(pool.size(), 1U); +} diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h new file mode 100644 index 0000000000000000000000000000000000000000..1e3084835439b0d55de72a669b93acbaef7ed6b9 --- /dev/null +++ b/paddle/framework/library_type.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +namespace paddle { +namespace framework { + +// For more details about the design of LibraryType, Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library + +enum class LibraryType { + kPlain = 0, + kMKLDNN = 1, + kCUDNN = 2, +}; + +inline std::string LibraryTypeToString(const LibraryType& library_type) { + switch (library_type) { + case LibraryType::kPlain: + return "PLAIN"; + case LibraryType::kMKLDNN: + return "MKLDNN"; + case LibraryType::kCUDNN: + return "CUDNN"; + default: + PADDLE_THROW("unknown LibraryType %d", static_cast(library_type)); + } +} + +inline LibraryType StringToLibraryType(const char* ctype) { + std::string s(ctype); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + if (s == std::string("PLAIN")) { + return LibraryType::kPlain; + } else if (s == std::string("MKLDNN")) { + return LibraryType::kMKLDNN; + } else if (s == std::string("CUDNN")) { + return LibraryType::kCUDNN; + // To be compatible with register macro. + // CPU, CUDA, PLAIN are same library type. + } else if (s == std::string("CPU")) { + return LibraryType::kPlain; + } else if (s == std::string("CUDA")) { + return LibraryType::kPlain; + } else { + PADDLE_THROW("Unknown LibraryType %s", s.c_str()); + } +} + +inline std::ostream& operator<<(std::ostream& out, LibraryType l) { + out << LibraryTypeToString(l); + return out; +} + +} // namespace +} // framework diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc index 1c2fba70c8ab0827ba6d1563f08cd0820650822e..704bce2a0eb60b974efd41a4edda0af2933da825 100644 --- a/paddle/framework/lod_rank_table.cc +++ b/paddle/framework/lod_rank_table.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
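One detail of the conversions above is worth spelling out: StringToLibraryType folds the place names accepted by the register macros into kPlain, so the string-to-enum mapping is deliberately lossy. A small sketch:

#include <cassert>
#include "paddle/framework/library_type.h"

void LibraryTypeSketch() {
  using paddle::framework::LibraryType;
  using paddle::framework::StringToLibraryType;
  assert(StringToLibraryType("MKLDNN") == LibraryType::kMKLDNN);
  // "CPU" and "CUDA" are macro-compatibility aliases, not distinct types:
  assert(StringToLibraryType("CPU") == LibraryType::kPlain);
  assert(StringToLibraryType("CUDA") == LibraryType::kPlain);
}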
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/lod_rank_table.h" @@ -46,4 +46,13 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) { } } // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} } // namespace paddle diff --git a/paddle/framework/lod_rank_table.h b/paddle/framework/lod_rank_table.h index 9faa3a4d7bdc55ab7b24e31f5e5434dacc0a4b36..df188709e91871ded0258fa5703ee16a5664f057 100644 --- a/paddle/framework/lod_rank_table.h +++ b/paddle/framework/lod_rank_table.h @@ -1,18 +1,19 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once +#include #include "paddle/framework/lod_tensor.h" namespace paddle { @@ -52,4 +53,8 @@ class LoDRankTable { }; } // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + } // namespace paddle diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index a0f2906c749054c1ff9f624e47df432ec2bd6ac8..87a57d095141cc456af2cbabbc227715a02375e9 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -1,18 +1,20 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
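With the operator<< just added (note it lives in namespace paddle, one level above framework), a rank table can now be streamed directly. A sketch with an illustrative one-level LoD, assuming LoDRankTable is default-constructible:

#include <iostream>
#include "paddle/framework/lod_rank_table.h"

namespace paddle {
void PrintRankTableSketch() {
  framework::LoD lod{{0, 3, 4, 8}};  // three sequences of lengths 3, 1, 4
  framework::LoDRankTable table;
  table.Reset(lod, /*level=*/0);
  // Prints "NumOfSequence 3" and then one "Seq #i, Len=n" line per item;
  // ordinary lookup finds paddle::operator<< from inside this namespace.
  std::cout << table;
}
}  // namespace paddle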
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/lod_tensor.h" +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" #include "paddle/memory/memcpy.h" #include "paddle/memory/memory.h" @@ -27,11 +29,11 @@ namespace paddle { namespace framework { -std::ostream& operator<<(std::ostream& os, const LoD& lod) { +std::ostream &operator<<(std::ostream &os, const LoD &lod) { os << "{"; - for (auto& v : lod) { + for (auto &v : lod) { os << "{"; - for (auto& i : v) { + for (auto &i : v) { os << i << ","; } os << "}"; @@ -41,19 +43,39 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { return os; } -LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { - LoD new_lod; - new_lod.reserve(level_end - level_begin); - for (size_t i = level_begin; i < level_end; i++) { - new_lod.emplace_back(in.at(i)); +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); + + if (!platform::is_cpu_place(t.place())) { + LoDTensor tt; + framework::Copy(t, platform::CPUPlace(), &tt); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + + os << tt; + return os; + } + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + os << t.data()[i] << " "; } - // transform the lowest level to absolute offset. 
- LoD abs_offset_lod = ToAbsOffset(in); - new_lod.back() = abs_offset_lod[level_end - 1]; - return new_lod; + + return os; +} + +std::string LoDToString(const LoD &lod) { + std::ostringstream stream; + stream << lod; + return stream.str(); } -LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE_LT(level, in.size()); PADDLE_ENFORCE_LT(elem_end, in[level].size()); @@ -64,9 +86,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, res[0].assign(in[level].begin() + elem_begin, in[level].begin() + elem_end + 1); for (size_t lvl = 1; lvl < res.size(); lvl++) { - const auto& in_level = in[level + lvl]; - const auto& above_level = res[lvl - 1]; - auto& out_level = res[lvl]; + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; out_level.assign(in_level.begin() + above_level.front(), in_level.begin() + above_level.back() + 1); } @@ -74,33 +96,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, // to make the first offset equals 0, all the elements minus the first // element size_t front = res[lvl].front(); - for (auto& ele : res[lvl]) { + for (auto &ele : res[lvl]) { ele -= front; } } return res; } -LoD ToAbsOffset(const LoD& in) { +LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets if (in.empty() || in.size() == 1) return in; LoD result = in; for (int level = result.size() - 2; level >= 0; level--) { - for (auto& ele : result[level]) { + for (auto &ele : result[level]) { ele = result[level + 1][ele]; } } return result; } -bool operator==(const LoD& a, const LoD& b) { +bool operator==(const LoD &a, const LoD &b) { if (a.size() != b.size()) { return false; } for (size_t i = 0; i < a.size(); i++) { - const auto& a_level = a[i]; - const auto& b_level = b[i]; + const auto &a_level = a[i]; + const auto &b_level = b[i]; if (a_level.size() != b_level.size()) { return false; } @@ -113,45 +135,8 @@ bool operator==(const LoD& a, const LoD& b) { return true; } -size_t LoDTensor::NumElements(size_t level, size_t idx) const { - PADDLE_ENFORCE_LT(level, NumLevels()); - PADDLE_ENFORCE_LT(idx, NumElements(level)); - return lod_[level][idx + 1] - lod_[level][idx]; -} - -size_t LoDTensor::NumInstancesInElement(size_t level, size_t idx) const { - PADDLE_ENFORCE_LT(level, NumLevels()); - PADDLE_ENFORCE_LT(idx, NumElements(level)); - auto abs_lod = ToAbsOffset(lod()); - size_t begin = abs_lod[level][idx]; - size_t end = abs_lod[level][idx + 1]; - return end - begin; -} - -void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) { - auto new_lod = framework::SliceLevels(lod_, level_begin, level_end); - lod_ = new_lod; -} - -void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, - size_t elem_end) { - PADDLE_ENFORCE_LT(level, NumLevels()); - PADDLE_ENFORCE_LT(elem_begin, NumElements(level)); - PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1); - - auto abs_lod = framework::ToAbsOffset(lod()); - auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end); - lod_ = new_lod; - - // slice the underlying tensor - size_t begin = abs_lod[level][elem_begin]; - size_t end = abs_lod[level][elem_end]; - PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty."); - ShareDataWith(Slice(begin, end)); -} - using LoDAndOffset = std::pair>; -LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, +LoDAndOffset 
GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level) { LoD sub_lod; @@ -170,20 +155,128 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, return LoDAndOffset{sub_lod, {start_idx, end_idx}}; } -void AppendLoD(LoD* lod, const LoD& lod_length) { +void AppendLoD(LoD *lod, const LoD &lod_length) { PADDLE_ENFORCE( lod->empty() || lod->size() == lod_length.size(), "The lod_length should have the same size as the appended lod."); if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } *lod = LoD(lod_length.size(), std::vector<size_t>({0})); } for (size_t i = 0; i < lod->size(); ++i) { - auto& level = (*lod)[i]; + auto &level = (*lod)[i]; for (size_t len : lod_length[i]) { level.push_back(level.back() + len); } } } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + { // the 1st field, uint32_t version for LoDTensor + constexpr uint32_t version = 0; + os.write(reinterpret_cast<const char *>(&version), sizeof(version)); + } + { + // the 2nd field, LoD information + // uint64_t lod_level + // uint64_t lod_level_1 size in bytes. + // int* lod_level_1 data + // ... + auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast<const char *>(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast<const char *>(&size), sizeof(size)); + os.write(reinterpret_cast<const char *>(each.data()), + static_cast<std::streamsize>(size)); + } + } + // the 3rd field, Tensor + SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx); +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor, + const platform::DeviceContext &dev_ctx) { + { + // the 1st field, uint32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast<char *>(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2nd field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast<char *>(&size), sizeof(size)); + std::vector<size_t> tmp(size / sizeof(size_t)); + is.read(reinterpret_cast<char *>(tmp.data()), + static_cast<std::streamsize>(size)); + lod[i] = tmp; + } + } + // the 3rd field, Tensor + DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx); +} + +// TODO(tonyyang-svail): make this function support LoD +std::vector<LoDTensor> LoDTensor::SplitLoDTensor( + const std::vector<platform::Place> places) const { + check_memory_size(); + PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now"); + PADDLE_ENFORCE(dims()[0] % places.size() == 0, + "Batch size should be divisible by places size"); + + std::vector<LoDTensor> lods; + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + int begin = place_idx * dims()[0] / places.size(); + int end = (place_idx + 1) * dims()[0] / places.size(); + + auto src = Slice(begin, end); + auto &dst_place = places[place_idx]; + LoDTensor dst; + framework::Copy(src, dst_place, &dst); + + lods.emplace_back(dst); + } + + return lods; +} + +// TODO(tonyyang-svail): make this function support LoD +void LoDTensor::MergeLoDTensor( + const std::vector<const LoDTensor *> &lod_tensors, + platform::Place dst_place) { + PADDLE_ENFORCE(!lod_tensors.empty()); + framework::DDim new_dim = lod_tensors[0]->dims(); + std::type_index new_type = lod_tensors[0]->type(); + auto new_layout = lod_tensors[0]->layout(); + for
(auto *lod : lod_tensors) { + PADDLE_ENFORCE(new_dim == lod->dims()); + PADDLE_ENFORCE(new_type == lod->type()); + PADDLE_ENFORCE(new_layout == lod->layout()); + } + new_dim[0] *= lod_tensors.size(); + Resize(new_dim); + set_layout(new_layout); + + mutable_data(dst_place, new_type); + int begin = 0; + for (auto *src : lod_tensors) { + int end = begin + src->dims()[0]; + auto dst = Slice(begin, end); + framework::Copy(*src, dst_place, &dst); + begin = end; + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 7f8a51cc581e759bc707e506ac7cdeb3680f40ac..88ea78f2682b2ffc962c9663f6b3c636dedb931d 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -24,6 +24,7 @@ #include #include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" @@ -57,14 +58,9 @@ using Vector = thrust::host_vector< using LoD = std::vector>; std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); -/* - * Slice levels from a LoD. - * NOTE the lowest level should always be the absolute offsets of the underlying - * tensor instances. So if higher layers are sliced without the lowest level, - * the lower level of the sliced LoD will be transformed to the absolute offset. - */ -LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end); +std::string LoDToString(const LoD& lod); LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end); @@ -114,34 +110,11 @@ class LoDTensor : public Tensor { return (lod_)[level].size() - 1; } - /* - * Number of lower-level elements. - * For example, a 2-level lod-tensor - * - * 0-th level | | - * 1-th level || ||| - * - * NumElements(0, 0) get 2 - * NumElements(0, 1) get 3 - */ - size_t NumElements(size_t level, size_t idx) const; + std::vector SplitLoDTensor( + const std::vector places) const; - /* - * Get the number of instances in the underlying tensor in the `idx`-th - * element. 
- */ - size_t NumInstancesInElement(size_t level, size_t idx) const; - - /* - * Shrink levels[level_begin:level_end] - */ - void ShrinkLevels(size_t level_begin, size_t level_end); - - /* - * Shrink elements of a level, [elem_begin: elem_end] - * @note: low performance in slice lod_. - */ - void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end); + void MergeLoDTensor(const std::vector<const LoDTensor*>& lod_tensors, + platform::Place place); private: LoD lod_; @@ -175,18 +148,40 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); for (size_t ins = 0; ins < num_instances; ins++) { for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { - tensor.Slice(elem, elem + 1) - .CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), - platform::CPUDeviceContext()); + auto slice = tensor.Slice(elem, elem + 1); + Copy(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); } } return tensor; } +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// the relative lengths of details for every level (i.e., [start_level:]). +// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset( const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); void AppendLoD(LoD* lod, const LoD& lod_length); +/* + * Serialize/Deserialize a LoDTensor to/from std::ostream + * You can pass an ofstream or ostringstream to serialize to a file + * or to an in-memory string. A GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor_array.h b/paddle/framework/lod_tensor_array.h index 13f0608d24be97d8bba149b74f1a4deb57deeb48..4a8e7f4fa540b1c2f19a6e3ec236a0dd5c0daf0b 100644 --- a/paddle/framework/lod_tensor_array.h +++ b/paddle/framework/lod_tensor_array.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
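Tying the two stream functions above together, a hedged round-trip sketch through an in-memory stream; src is assumed to be a CPU LoDTensor that already holds data:

#include <sstream>
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/device_context.h"

void SerializeRoundTripSketch(const paddle::framework::LoDTensor& src) {
  std::stringstream ss;
  paddle::platform::CPUDeviceContext cpu_ctx;
  paddle::framework::SerializeToStream(ss, src, cpu_ctx);  // version 0 format

  paddle::framework::LoDTensor dst;
  paddle::framework::DeserializeFromStream(ss, &dst, cpu_ctx);
  // dst now carries src's LoD, dims and data.
}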
*/ #pragma once #include diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 02d84b68233f2fdfc66e1df2fc7ce20307cadd94..19ae7815cc1008f9f468821d61b26f221e266d5e 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); @@ -54,78 +67,6 @@ class LoDTensorTester : public ::testing::Test { LoDTensor lod_tensor_; }; -TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); } - -TEST_F(LoDTensorTester, NumElements) { - ASSERT_EQ(lod_tensor_.NumElements(0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(1), 3UL); - ASSERT_EQ(lod_tensor_.NumElements(2), 8UL); -} - -TEST_F(LoDTensorTester, NumElements2) { - ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL); - ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL); -} - -TEST_F(LoDTensorTester, ShrinkLevels) { - // slice 1 level - for (size_t level = 0; level < 3UL; ++level) { - LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkLevels(level, level + 1); - ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); - ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); - } - // shrink 2 level - for (size_t level = 0; level < 2UL; ++level) { - LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkLevels(level, level + 2); - // the lowest level's last element should be the tensor's batch_size. 
- ASSERT_EQ(new_lod_tensor.lod().back().back(), - lod_tensor_.lod().back().back()); - ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); - } -} - -TEST_F(LoDTensorTester, ShrinkInLevel) { - size_t level = 0; - LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 0, 1); - ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL); - ASSERT_EQ(new_lod_tensor.NumElements(1), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(2), 5UL); - ASSERT_EQ(new_lod_tensor.dims()[0], 12); - for (int i = 0; i < 12 * 128; i++) { - ASSERT_EQ(new_lod_tensor.data()[i], i); - } - - level = 1; - new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 1, 2); - ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL); - ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL); - ASSERT_EQ(new_lod_tensor.dims()[0], 7); - for (int i = 5 * 128; i < 12 * 128; i++) { - ASSERT_EQ(new_lod_tensor.data()[i - 5 * 128], i); - } - - LoDTensor t1; - t1.set_lod(lod_tensor_.lod()); - t1.ShareDataWith(lod_tensor_); - - LoDTensor t2; - t2.set_lod(lod_tensor_.lod()); - t2.ShareDataWith(lod_tensor_); - - t1.ShrinkInLevel(0, 1, 2); - t2.ShrinkInLevel(0, 0, 1); - EXPECT_NE(t1.data(), t2.data()); - EXPECT_NE(t1.data(), lod_tensor_.data()); -} - TEST(LodExpand, test) { LoD lod{{0, 2}}; LoDTensor tensor; @@ -187,5 +128,21 @@ TEST(LoD, AppendLoD) { EXPECT_EQ(origin, expected); } +TEST(LoD, ToAbsOffset) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod); + + LoD expected; + expected.push_back(std::vector({0, 5})); + expected.push_back(std::vector({0, 2, 5})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(abs_lod, expected); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu index 5b90fbfca7f6bec4f2c862d0ff18dfd7cf39e181..0f46e9b1e3966a49ff0673231c65f608797c80fe 100644 --- a/paddle/framework/lod_tensor_test.cu +++ b/paddle/framework/lod_tensor_test.cu @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,7 +40,7 @@ __global__ void test(size_t* a, int size) { TEST(LoDTensor, LoDInGPU) { paddle::framework::LoDTensor lod_tensor; - paddle::platform::GPUPlace place(0); + paddle::platform::CUDAPlace place(0); paddle::framework::LoD src_lod; src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 48cd131550dea5ad3f368b25c31d753efbe0dff9..1c0372bb16c04e155a68a0411939e4887322107a 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -25,12 +25,11 @@ limitations under the License. */ namespace paddle { namespace framework { -class OpDescBind; -class BlockDescBind; +class OpDesc; +class BlockDesc; class CompileTimeInferShapeContext : public InferShapeContext { public: - CompileTimeInferShapeContext(const OpDescBind &op, - const BlockDescBind &block); + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); bool HasInput(const std::string &name) const override; @@ -58,31 +57,31 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_LT(j, Outputs(out).size()); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); - if (in_var->GetType() != VarDesc::LOD_TENSOR) { - VLOG(3) << "input " << in << "is not LodTensor"; + if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) { + VLOG(3) << "input " << in << " is not LodTensor"; return; } - PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, + PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); - in_var->SetLoDLevel(out_var->GetLodLevel()); + out_var->SetLoDLevel(in_var->GetLoDLevel()); } + bool IsRuntime() const override; protected: - VarDesc::VarType GetVarType(const std::string &name) const override; + proto::VarDesc::VarType GetVarType(const std::string &name) const override; DDim GetDim(const std::string &name) const override; void SetDim(const std::string &name, const DDim &dim) override; - const OpDescBind &op_; - const BlockDescBind &block_; + const OpDesc &op_; + const BlockDesc &block_; }; -OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs) { +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { desc_.set_type(type); inputs_ = inputs; outputs_ = outputs; @@ -90,12 +89,20 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } -OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) +void OpDesc::CopyFrom(const OpDesc &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) : desc_(desc), need_update_(false) { // restore inputs_ int input_size = desc_.inputs_size(); for (int i = 0; i < input_size; ++i) { - const OpDesc::Var &var = desc_.inputs(i); + const proto::OpDesc::Var &var = desc_.inputs(i); std::vector &args = inputs_[var.parameter()]; int argu_size = var.arguments_size(); args.reserve(argu_size); @@ -106,7 +113,7 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) // restore outputs_ int output_size = desc_.outputs_size(); for (int i = 0; i < output_size; ++i) { - const OpDesc::Var &var 
= desc_.outputs(i); + const proto::OpDesc::Var &var = desc_.outputs(i); std::vector &args = outputs_[var.parameter()]; int argu_size = var.arguments_size(); args.reserve(argu_size); @@ -115,9 +122,9 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) } } // restore attrs_ - for (const OpDesc::Attr &attr : desc_.attrs()) { + for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); - if (attr.type() != AttrType::BLOCK) { + if (attr.type() != proto::AttrType::BLOCK) { attrs_[attr_name] = GetAttrValue(attr); } else { auto bid = attr.block_idx(); @@ -126,20 +133,19 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) } } -OpDesc *OpDescBind::Proto() { +proto::OpDesc *OpDesc::Proto() { Flush(); return &desc_; } -const std::vector &OpDescBind::Input( - const std::string &name) const { +const std::vector &OpDesc::Input(const std::string &name) const { auto it = inputs_.find(name); PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, Type()); return it->second; } -std::vector OpDescBind::InputArgumentNames() const { +std::vector OpDesc::InputArgumentNames() const { std::vector retv; for (auto &ipt : this->inputs_) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); @@ -147,21 +153,20 @@ std::vector OpDescBind::InputArgumentNames() const { return retv; } -void OpDescBind::SetInput(const std::string ¶m_name, - const std::vector &args) { +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { need_update_ = true; inputs_[param_name] = args; } -const std::vector &OpDescBind::Output( - const std::string &name) const { +const std::vector &OpDesc::Output(const std::string &name) const { auto it = outputs_.find(name); PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", name, Type()); return it->second; } -std::vector OpDescBind::OutputArgumentNames() const { +std::vector OpDesc::OutputArgumentNames() const { std::vector retv; for (auto &ipt : this->outputs_) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); @@ -169,19 +174,19 @@ std::vector OpDescBind::OutputArgumentNames() const { return retv; } -void OpDescBind::SetOutput(const std::string ¶m_name, - const std::vector &args) { +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { need_update_ = true; this->outputs_[param_name] = args; } -AttrType OpDescBind::GetAttrType(const std::string &name) const { +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); - return static_cast(it->second.which() - 1); + return static_cast(it->second.which() - 1); } -std::vector OpDescBind::AttrNames() const { +std::vector OpDesc::AttrNames() const { std::vector retv; retv.reserve(attrs_.size()); for (auto &attr : attrs_) { @@ -190,41 +195,39 @@ std::vector OpDescBind::AttrNames() const { return retv; } -void OpDescBind::SetAttr(const std::string &name, const Attribute &v) { +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = v; need_update_ = true; } -void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) { +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { this->attrs_[name] = █ need_update_ = true; } -void OpDescBind::SetAttrMap( +void OpDesc::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; need_update_ = true; } -Attribute OpDescBind::GetAttr(const std::string 
&name) const { +Attribute OpDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return it->second; } -int OpDescBind::GetBlockAttr(const std::string &name) const { +int OpDesc::GetBlockAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); - return boost::get(it->second)->ID(); + return boost::get(it->second)->ID(); } -const std::unordered_map &OpDescBind::GetAttrMap() - const { +const std::unordered_map &OpDesc::GetAttrMap() const { return attrs_; } -void OpDescBind::Rename(const std::string &old_name, - const std::string &new_name) { +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { for (auto &input : inputs_) { std::replace(input.second.begin(), input.second.end(), old_name, new_name); } @@ -235,8 +238,8 @@ void OpDescBind::Rename(const std::string &old_name, need_update_ = true; } -void OpDescBind::RenameOutput(const std::string &old_name, - const std::string &new_name) { +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { for (auto &output : outputs_) { std::replace(output.second.begin(), output.second.end(), old_name, new_name); @@ -244,8 +247,8 @@ void OpDescBind::RenameOutput(const std::string &old_name, need_update_ = true; } -void OpDescBind::RenameInput(const std::string &old_name, - const std::string &new_name) { +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { for (auto &input : inputs_) { std::replace(input.second.begin(), input.second.end(), old_name, new_name); } @@ -253,12 +256,18 @@ void OpDescBind::RenameInput(const std::string &old_name, } struct SetAttrDescVisitor : public boost::static_visitor { - explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} - mutable OpDesc::Attr *attr_; + explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} + mutable proto::OpDesc::Attr *attr_; void operator()(int v) const { attr_->set_i(v); } void operator()(float v) const { attr_->set_f(v); } void operator()(const std::string &v) const { attr_->set_s(v); } - void operator()(bool b) const { attr_->set_b(b); } + + // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 + template ::value>::type> + void operator()(T b) const { + attr_->set_b(b); + } void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_ints()); @@ -272,11 +281,11 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } - void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); } + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; -void OpDescBind::Flush() { +void OpDesc::Flush() { if (need_update_) { this->desc_.mutable_inputs()->Clear(); for (auto &ipt : inputs_) { @@ -297,7 +306,7 @@ void OpDescBind::Flush() { auto *attr_desc = desc_.add_attrs(); attr_desc->set_name(attr.first); attr_desc->set_type( - static_cast(attr.second.which() - 1)); + static_cast(attr.second.which() - 1)); SetAttrDescVisitor visitor(attr_desc); boost::apply_visitor(visitor, attr.second); } @@ -316,8 +325,8 @@ static void InitInferShapeFuncs() { for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { auto op_type = kern_pair.first; auto &op_info = info_map.at(op_type); - auto op 
= - static_cast(op_info.Creator()("", {}, {}, {})); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); if (op_info.infer_shape_) { // infer_shape has been registered. continue; } @@ -328,7 +337,7 @@ static void InitInferShapeFuncs() { }); } -void OpDescBind::CheckAttrs() { +void OpDesc::CheckAttrs() { PADDLE_ENFORCE(!Type().empty(), "CheckAttr() can not be called before type is setted."); auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); @@ -340,7 +349,7 @@ void OpDescBind::CheckAttrs() { checker->Check(attrs_); } -void OpDescBind::InferShape(const BlockDescBind &block) const { +void OpDesc::InferShape(const BlockDesc &block) const { VLOG(3) << "CompileTime infer shape on " << Type(); InitInferShapeFuncs(); auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; @@ -363,7 +372,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const { infer_shape(&ctx); } -void OpDescBind::InferVarType(BlockDescBind *block) const { +void OpDesc::InferVarType(BlockDesc *block) const { auto &info = OpInfoMap::Instance().Get(this->Type()); if (info.infer_var_type_) { info.infer_var_type_(*this, block); @@ -375,14 +384,14 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { for (auto &out_pair : this->outputs_) { for (auto &out_var_name : out_pair.second) { block->FindRecursiveOrCreateVar(out_var_name) - ->SetType(VarDesc::LOD_TENSOR); + .SetType(proto::VarDesc::LOD_TENSOR); } } } } CompileTimeInferShapeContext::CompileTimeInferShapeContext( - const OpDescBind &op, const BlockDescBind &block) + const OpDesc &op, const BlockDesc &block) : op_(op), block_(block) {} bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { @@ -466,7 +475,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto var = block_.FindVarRecursive(name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); try { - return framework::make_ddim(var->Shape()); + auto shape = var->Shape(); + if (shape.empty()) { + return framework::make_ddim({0UL}); + } else { + return framework::make_ddim(var->Shape()); + } } catch (...) { VLOG(5) << "GetDim of variable " << name << " error"; std::rethrow_exception(std::current_exception()); @@ -479,7 +493,7 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name, } bool CompileTimeInferShapeContext::IsRuntime() const { return false; } -VarDesc::VarType CompileTimeInferShapeContext::GetVarType( +proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType( const std::string &name) const { return block_.FindVarRecursive(name)->GetType(); } diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index da032319afa775571d3942bf6ae415db7d233735..a5ffb162928bfd355d35d3f9b63aab59a88dd061 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -23,19 +23,21 @@ limitations under the License. 
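[Editor's note] With the Bind suffix dropped and the generated protobuf types moved into the proto namespace, descriptor-building code changes accordingly. A minimal usage sketch (hypothetical op and variable names, assuming the renamed headers):

#include "paddle/framework/op_desc.h"

void BuildMulDesc() {
  using paddle::framework::OpDesc;
  // inputs/outputs are VariableNameMap: argument name -> variable names
  OpDesc op("mul", {{"X", {"x0"}}, {"Y", {"y0"}}}, {{"Out", {"out0"}}},
            paddle::framework::AttributeMap{});
  // Proto() flushes the in-memory maps into the wrapped proto::OpDesc
  paddle::framework::proto::OpDesc* raw = op.Proto();
  (void)raw;
}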
*/ namespace paddle { namespace framework { -class BlockDescBind; -class ProgramDescBind; +class BlockDesc; +class ProgramDesc; -class OpDescBind { +class OpDesc { public: - OpDescBind() {} + OpDesc() {} - OpDescBind(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs); + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); - OpDescBind(const OpDesc &desc, ProgramDescBind *prog); + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); - OpDesc *Proto(); + void CopyFrom(const OpDesc &op_desc); + + proto::OpDesc *Proto(); std::string Type() const { return desc_.type(); } @@ -59,13 +61,13 @@ class OpDescBind { return attrs_.find(name) != attrs_.end(); } - AttrType GetAttrType(const std::string &name) const; + proto::AttrType GetAttrType(const std::string &name) const; std::vector AttrNames() const; void SetAttr(const std::string &name, const Attribute &v); - void SetBlockAttr(const std::string &name, BlockDescBind &block); + void SetBlockAttr(const std::string &name, BlockDesc &block); Attribute GetAttr(const std::string &name) const; @@ -107,9 +109,9 @@ class OpDescBind { void CheckAttrs(); - void InferShape(const BlockDescBind &block) const; + void InferShape(const BlockDesc &block) const; - void InferVarType(BlockDescBind *block) const; + void InferVarType(BlockDesc *block) const; void MarkAsTarget() { desc_.set_is_target(true); } @@ -126,8 +128,10 @@ class OpDescBind { return ret_val; } - OpDesc desc_; + proto::OpDesc desc_; + // input arg name => input variable names VariableNameMap inputs_; + // output arg name => output variable names VariableNameMap outputs_; AttributeMap attrs_; diff --git a/paddle/framework/op_info.cc b/paddle/framework/op_info.cc index 81ba29797c5f478e5d6a91236f3e8de1e6b43e49..b520108109bb2f72b80f83559fa065a5ca58e9e1 100644 --- a/paddle/framework/op_info.cc +++ b/paddle/framework/op_info.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/op_info.h" diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h index d3b1a3b5fa2cf8f6a9571e92a319f3757666657e..d9b89f9cac9611fcecb18bef87940632df1e2234 100644 --- a/paddle/framework/op_info.h +++ b/paddle/framework/op_info.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include @@ -34,7 +34,7 @@ class InferShapeBase { struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; - OpProto* proto_{nullptr}; + proto::OpProto* proto_{nullptr}; OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; @@ -43,7 +43,7 @@ struct OpInfo { return proto_ != nullptr && checker_ != nullptr; } - const OpProto& Proto() const { + const proto::OpProto& Proto() const { PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); PADDLE_ENFORCE(proto_->IsInitialized(), "Operator Proto must be initialized in op info"); diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h new file mode 100644 index 0000000000000000000000000000000000000000..312bd5f892ac23c847c87388c9cadf2161028d3e --- /dev/null +++ b/paddle/framework/op_kernel_type.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/data_layout.h" +#include "paddle/framework/data_type.h" +#include "paddle/framework/library_type.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace framework { + +struct OpKernelType { + struct Hash { + size_t operator()(const OpKernelType& key) const { + int place = key.place_.which(); + int data_type = static_cast(key.data_type_) << LEFT_SHIFT; + int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); + int library_type = static_cast(key.library_type_) + << (LEFT_SHIFT * 3); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type); + } + }; + + // place, data_type, library_type kinds less than 2^8 + constexpr static int LEFT_SHIFT = 8; + + proto::DataType data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + + OpKernelType(proto::DataType data_type, platform::Place place, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(place), + library_type_(library_type) {} + + OpKernelType(proto::DataType data_type, + const platform::DeviceContext& dev_ctx, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(dev_ctx.GetPlace()), + library_type_(library_type) {} + + bool operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_; + } + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } +}; + +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + +inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { + return (!platform::places_are_same_class(l.place_, r.place_)) || + (l.data_type_ != r.data_type_) || (l.data_layout_ != r.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..649afeee8a846b0579545f2edff77e9dbe3b4dd8 --- /dev/null +++ b/paddle/framework/op_kernel_type_test.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
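[Editor's note] The Hash functor above packs the four fields of a kernel key into a single int before hashing, assuming each enum fits in 8 bits (LEFT_SHIFT). A standalone sketch of the same packing (hypothetical helper, not part of this patch):

#include <cstddef>
#include <functional>

// byte 0: place kind, byte 1: data type, byte 2: layout, byte 3: library type
inline size_t HashKernelKey(int place, int data_type, int data_layout,
                            int library_type) {
  const int kLeftShift = 8;  // each field is assumed to stay below 2^8
  int packed = place + (data_type << kLeftShift) +
               (data_layout << (kLeftShift * 2)) +
               (library_type << (kLeftShift * 3));
  return std::hash<int>()(packed);
}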
*/ + +#include "paddle/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + ASSERT_EQ( + paddle::framework::KernelTypeToString(op_kernel_type), + "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h index 44e8ab16895cc604f85bb83e240eab55739f8ba0..efd3a5ca535403d8d46a73adc899d914623b53e4 100644 --- a/paddle/framework/op_proto_maker.h +++ b/paddle/framework/op_proto_maker.h @@ -22,6 +22,8 @@ namespace framework { // this class not only make proto but also init attribute checkers. class OpProtoAndCheckerMaker { public: + using OpProto = proto::OpProto; + using OpAttrChecker = framework::OpAttrChecker; OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : proto_(proto), op_checker_(op_checker) {} @@ -80,7 +82,7 @@ class OpProtoAndCheckerMaker { class NOPMaker : public OpProtoAndCheckerMaker { public: - NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) {} }; diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc index 988a14cf4de8fdf052ca7e8c41bff0c05ba2daaa..f16cb6fa3aa095a6d9737d84c7ce58f385a7072b 100644 --- a/paddle/framework/op_proto_maker_test.cc +++ b/paddle/framework/op_proto_maker_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: - TestAttrProtoMaker(paddle::framework::OpProto* proto, + TestAttrProtoMaker(paddle::framework::proto::OpProto* proto, paddle::framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("scale", "scale of test op"); @@ -27,7 +27,7 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, DuplicatedAttr) { - paddle::framework::OpProto op_proto; + paddle::framework::proto::OpProto op_proto; paddle::framework::OpAttrChecker op_checker; auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); @@ -35,7 +35,7 @@ TEST(ProtoMaker, DuplicatedAttr) { class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: - TestInOutProtoMaker(paddle::framework::OpProto* proto, + TestInOutProtoMaker(paddle::framework::proto::OpProto* proto, paddle::framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); @@ -44,7 +44,7 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, DuplicatedInOut) { - paddle::framework::OpProto op_proto; + paddle::framework::proto::OpProto op_proto; paddle::framework::OpAttrChecker op_checker; auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 8dedd873aad648174b770b84e5232cd17b577e72..dfa151316daeccfe92e26818165a694b78b5df62 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -31,7 +31,8 @@ std::unique_ptr OpRegistry::CreateOp( } static VariableNameMap ConvertOpDescVarsToVarNameMap( - const google::protobuf::RepeatedPtrField& op_desc_vars) { + const google::protobuf::RepeatedPtrField& + op_desc_vars) { VariableNameMap ret_val; for (auto& var : op_desc_vars) { auto& var_names = ret_val[var.parameter()]; @@ -43,9 +44,10 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap( return ret_val; } -std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { +std::unique_ptr OpRegistry::CreateOp( + const proto::OpDesc& op_desc) { VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" - "used in unit tests. Use CreateOp(const OpDescBind& op_desc) " + "used in unit tests. Use CreateOp(const OpDesc& op_desc) " "instead."; VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); @@ -57,7 +59,7 @@ std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { return CreateOp(op_desc.type(), inputs, outputs, attrs); } -std::unique_ptr OpRegistry::CreateOp(const OpDescBind& op_desc) { +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), op_desc.GetAttrMap()); } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index daade439e5232f06be72bc5bb1e2285124f2c3a4..d75c0233e8e0134ddf4edc50c07490a234b65cd0 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -37,8 +37,8 @@ class Registrar { public: // In our design, various kinds of classes, e.g., operators and kernels, // have their corresponding registry and registrar. 
The action of - // registration is in the constructor of a global registrar variable, which, - // however, are not used in the code that calls package framework, and would + // registration is in the constructor of a global registrar variable, which + // is not used in the code that calls package framework, and would // be removed from the generated binary file by the linker. To avoid such // removal, we add Touch to all registrar classes and make USE_OP macros to // call this method. So, as long as the callee code calls USE_OP, the global @@ -61,25 +61,14 @@ struct OperatorRegistrar : public Registrar { class OpRegistry { public: - template - static void RegisterOp(const std::string& op_type, - const std::string& grad_op_type) { - OperatorRegistrar reg(op_type.c_str()); - reg.info.grad_op_type_ = grad_op_type; - // register gradient op - if (!grad_op_type.empty()) { - OperatorRegistrar grad_reg(grad_op_type.c_str()); - } - } - static std::unique_ptr CreateOp(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, AttributeMap attrs); - static std::unique_ptr CreateOp(const OpDesc& op_desc); + static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); - static std::unique_ptr CreateOp(const OpDescBind& op_desc); + static std::unique_ptr CreateOp(const OpDesc& op_desc); }; template @@ -90,30 +79,31 @@ struct OpKernelRegistrarFunctor { using KERNEL_TYPE = typename std::tuple_element>::type; - void operator()(const char* op_type) const { + void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; - OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType()); + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + DataLayout::kAnyLayout, StringToLibraryType(library_type)); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; - func(op_type); + func(op_type, library_type); } }; template struct OpKernelRegistrarFunctor { - void operator()(const char* op_type) const {} + void operator()(const char* op_type, const char* library_type) const {} }; // User can register many kernel in one place. The data type could be different. template class OpKernelRegistrar : public Registrar { public: - explicit OpKernelRegistrar(const char* op_type) { + explicit OpKernelRegistrar(const char* op_type, const char* library_type) { OpKernelRegistrarFunctor func; - func(op_type); + func(op_type, library_type); } }; @@ -126,6 +116,14 @@ class OpKernelRegistrar : public Registrar { __test_global_namespace_##uniq_name##__>::value, \ msg) +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ #define REGISTER_OPERATOR(op_type, op_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ __reg_op__##op_type, \ @@ -144,20 +142,29 @@ class OpKernelRegistrar : public Registrar { } /** - * Macro to register Operator. + * Macro to register Operator. When the input is duplicable, you should + * use REGISTER_OP_EX with drop_empty_grad=false instead.
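 * A hypothetical example (not from this patch): the short form
 *   REGISTER_OP(my_op, MyOp, MyOpMaker, my_op_grad, MyOpGrad);
 * is equivalent to REGISTER_OP_EX(..., true), while an op with duplicable
 * inputs whose empty gradient slots must be kept would be registered as
 *   REGISTER_OP_EX(my_op, MyOp, MyOpMaker, my_op_grad, MyOpGrad, false);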
*/ -#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class) \ - REGISTER_OPERATOR(grad_op_type, grad_op_class); \ - class _GradOpDescMaker_##grad_op_type##_ \ - : public ::paddle::framework::DefaultGradOpDescMaker { \ - using ::paddle::framework::DefaultGradOpDescMaker< \ - true>::DefaultGradOpDescMaker; \ - \ - protected: \ - virtual std::string GradOpType() const { return #grad_op_type; } \ - }; \ - REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, true) + +// When an argument is duplicable, we need to use this version. +// Perhaps we can omit DropEmptyIG template parameter and +// only have one version of REGISTER_OP. +#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, drop_empty_grad) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + drop_empty_grad>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ op_maker_class); #define REGISTER_OP_WITH_KERNEL(op_type, ...) \ @@ -175,14 +182,15 @@ class OpKernelRegistrar : public Registrar { __reg_op_kernel_##op_type##_##DEVICE_TYPE##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type); \ + __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type, \ + #DEVICE_TYPE); \ int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() { \ __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch(); \ return 0; \ } -#define REGISTER_OP_GPU_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__) +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) @@ -217,7 +225,7 @@ class OpKernelRegistrar : public Registrar { #else #define USE_OP_KERNEL(op_type) \ USE_OP_DEVICE_KERNEL(op_type, CPU); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) + USE_OP_DEVICE_KERNEL(op_type, CUDA) #endif #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); @@ -226,9 +234,9 @@ class OpKernelRegistrar : public Registrar { USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); -#define USE_GPU_ONLY_OP(op_type) \ - USE_OP_ITSELF(op_type); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) #define USE_OP(op_type) \ USE_OP_ITSELF(op_type); \ diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index b860fe6cac773d1e85adecc43f5dfec42b6c7661..341da8befd45abd1a3fc86581be33319a8791567 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,15 +1,31 @@ -#include "paddle/framework/op_registry.h" +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include #include +#include "paddle/framework/op_registry.h" + namespace pd = paddle::framework; namespace paddle { namespace framework { + class CosineOp : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -28,8 +44,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -51,7 +66,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { static void BuildVar(const std::string& param_name, std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { + paddle::framework::proto::OpDesc::Var* var) { var->set_parameter(param_name); for (auto& arg_name : arguments) { var->add_arguments(arg_name); @@ -63,7 +78,7 @@ REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, paddle::framework::MyTestOpProtoAndCheckerMaker); TEST(OpRegistry, CreateOp) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("cos_sim"); BuildVar("input", {"aa"}, op_desc.add_inputs()); BuildVar("output", {"bb"}, op_desc.add_outputs()); @@ -71,26 +86,26 @@ TEST(OpRegistry, CreateOp) { float scale = 3.3; auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(scale); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::framework::Scope scope; - paddle::platform::CPUDeviceContext dev_ctx; - op->Run(scope, dev_ctx); + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); float scale_get = op->Attr("scale"); ASSERT_EQ(scale_get, scale); } TEST(OpRegistry, IllegalAttr) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("cos_sim"); BuildVar("input", {"aa"}, op_desc.add_inputs()); BuildVar("output", {"bb"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(-2.0); bool caught = false; @@ -108,7 +123,7 @@ TEST(OpRegistry, IllegalAttr) { } TEST(OpRegistry, DefaultValue) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("cos_sim"); BuildVar("input", {"aa"}, op_desc.add_inputs()); BuildVar("output", {"bb"}, op_desc.add_outputs()); @@ -117,13 +132,13 @@ TEST(OpRegistry, DefaultValue) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::framework::Scope scope; - paddle::platform::CPUDeviceContext dev_ctx; - 
op->Run(scope, dev_ctx); + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); ASSERT_EQ(op->Attr("scale"), 1.0); } TEST(OpRegistry, CustomChecker) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("my_test_op"); BuildVar("input", {"ii"}, op_desc.add_inputs()); BuildVar("output", {"oo"}, op_desc.add_outputs()); @@ -145,7 +160,7 @@ TEST(OpRegistry, CustomChecker) { // set 'test_attr' set to an illegal value auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("test_attr"); - attr->set_type(paddle::framework::AttrType::INT); + attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_i(3); caught = false; try { @@ -164,12 +179,12 @@ TEST(OpRegistry, CustomChecker) { op_desc.mutable_attrs()->Clear(); attr = op_desc.mutable_attrs()->Add(); attr->set_name("test_attr"); - attr->set_type(paddle::framework::AttrType::INT); + attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_i(4); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - paddle::platform::CPUDeviceContext dev_ctx; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; - op->Run(scope, dev_ctx); + op->Run(scope, cpu_place); int test_attr = op->Attr("test_attr"); ASSERT_EQ(test_attr, 4); } @@ -184,3 +199,175 @@ TEST(OperatorRegistrar, Test) { using namespace paddle::framework; OperatorRegistrar reg("cos"); } + +namespace paddle { +namespace framework { + +class OpKernelTestMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment("NoGradOp, same input output. no Grad"); + } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); + } +}; + +template +class OpKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const {} +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, + paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_CPU_KERNEL( + op_with_kernel, + paddle::framework::OpKernelTest); + +REGISTER_OP_CUDA_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest< + paddle::platform::CUDADeviceContext, float>); + +TEST(OperatorRegistrar, CPU) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cpu_place); +} + +TEST(OperatorRegistrar, CUDA) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cuda_place); +} + +static int op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void 
InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 93467ab8ac796277b47a861a427de2837fb2d3d4..84c010df7c396fc21904ae3c980f5fad70b2ceac 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -11,30 +11,60 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
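[Editor's note] In the multi-kernel test above, GetExpectedKernelType pins the key to (CUDAPlace(0), kCUDNN), so of the four registered kernels only the CUDNN one runs and op_test_value ends at -10. The DEVICE_TYPE argument of REGISTER_OP_KERNEL is now stringified into the library type as well; roughly (simplified sketch of the expansion, identifiers abbreviated):

// REGISTER_OP_KERNEL(op_with_multi_kernel, CUDNN,
//                    ::paddle::platform::CUDAPlace, CudnnKernel)
// ~> a registrar constructed with both the op name and the library name:
//    static OpKernelRegistrar<CUDAPlace, CudnnKernel>
//        registrar("op_with_multi_kernel", "CUDNN");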
*/ +#include +#include -#include "paddle/framework/operator.h" #include -#include -#include "paddle/framework/lod_tensor_array.h" + +#include "paddle/framework/data_transform.h" +#include "paddle/framework/executor.h" +#include "paddle/framework/operator.h" #include "paddle/framework/shape_inference.h" #include "paddle/framework/var_type.h" +DEFINE_bool(op_sync, false, + "Default cuda is asynchronous device, set to True will " + "force op to run in synchronous mode."); + namespace paddle { namespace framework { -template <> -Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_.GetEigenDevice(); +std::vector> kKernelPriority = { + std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), + std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), + std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN), + std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), +}; + +static DDim GetDims(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return DDim({-1}); + } + + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + return DDim({-1}); + } } -#ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice& -ExecutionContext::GetEigenDevice() const { - return *device_context_.GetEigenDevice(); +static LoD GetLoD(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + auto default_lod = LoD({{}}); + + if (var == nullptr) { + return default_lod; + } + + if (var->IsType()) { + return var->Get().lod(); + } else { + return default_lod; + } } -#endif std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); @@ -68,7 +98,7 @@ const std::vector& OperatorBase::Outputs( return it->second; } -std::string OperatorBase::DebugString() const { +std::string OperatorBase::DebugStringEx(const Scope* scope) const { std::stringstream ss; ss << "Op(" << type_ << "), inputs:{"; for (auto it = inputs_.begin(); it != inputs_.end();) { @@ -76,6 +106,10 @@ std::string OperatorBase::DebugString() const { ss << input.first << "["; for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "(" << GetLoD(*scope, input.second[i]) << ")"; + } if (i != input.second.size() - 1) { ss << ", "; } @@ -92,6 +126,10 @@ std::string OperatorBase::DebugString() const { ss << output.first << "["; for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "(" << GetLoD(*scope, output.second[i]) << ")"; + } if (i != output.second.size() - 1) { ss << ", "; } @@ -189,34 +227,37 @@ void OperatorBase::GenerateTemporaryNames() { } } -static const Tensor* GetTensorFromVar(const Variable* var) { - const Tensor* t = nullptr; +static bool VarIsTensor(const Variable* var) { + return var->IsType() || var->IsType(); +} + +static const Tensor* GetTensorFromVar(Variable* var) { if (var->IsType()) { - t = &(var->Get()); + return var->GetMutable(); } else if (var->IsType()) { - t = &(var->Get().value()); + return var->GetMutable()->mutable_value(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); } - return t; } static Tensor* GetMutableTensorFromVar(Variable* var)
{ - Tensor* t = nullptr; if (var->IsType()) { - t = var->GetMutable(); + return var->GetMutable(); } else if (var->IsType()) { - t = var->GetMutable()->mutable_value(); + return var->GetMutable()->mutable_value(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); } - return t; } template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); - return var == nullptr ? nullptr : GetTensorFromVar(var); + return var == nullptr ? nullptr + : GetTensorFromVar(const_cast(var)); } template <> @@ -254,17 +295,12 @@ std::vector ExecutionContext::MultiOutput( return res; } -std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) { - os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_ - << "]"; - return os; -} - bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); if (it == all_kernels.end()) { // All control operator must support GPU + return true; } for (auto& kern_pair : it->second) { @@ -364,6 +400,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); + + // TODO(dzhwinter) : reuse ShareLoD in most operators. + // Need to call ShareLayout explicitly in sequence related ops. + // Shall we have a better method to shared info between in/out Tensor? + out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); } bool IsRuntime() const override { return true; } @@ -376,7 +431,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { return var->Get().GetCompleteDims(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); } } @@ -387,11 +443,12 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { var->GetMutable()->set_height(dim[0]); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); } } - VarDesc::VarType GetVarType(const std::string& name) const override { + proto::VarDesc::VarType GetVarType(const std::string& name) const override { auto* var = scope_.FindVar(name); return ToVarType(var->Type()); } @@ -402,11 +459,11 @@ class RuntimeInferShapeContext : public InferShapeContext { }; void OperatorWithKernel::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { + const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); - - ExecutionContext ctx(*this, scope, dev_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + 
auto dev_ctx = pool.Get(place); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); @@ -416,25 +473,66 @@ void OperatorWithKernel::Run(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - // check if op[type] have kernel for kernel_key + ExecutionContext ctx(*this, scope, *dev_ctx); + OpKernelMap& kernels = kernels_iter->second; - auto kernel_key = GetKernelType(ctx); - auto kernel_iter = kernels.find(kernel_key); - if (kernel_iter == kernels.end()) { - PADDLE_THROW("The operator %s does not support %s", type_, kernel_key); + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. + + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = this->GetExpectedKernelType(ctx); + + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + Scope& new_scope = scope.NewScope(); + + for (auto& var_name_item : this->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + if (var && VarIsTensor(var)) { + auto* tensor_in = GetTensorFromVar(var); + if (tensor_in->IsInitialized()) { + auto kernel_type_for_var = this->GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), + var_name) != out_var_names.end()) { + PADDLE_THROW( + "var %s is both input and output, " + "does not support transform", + var_name); + } + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + auto* trans_var = new_scope.Var(var_name); + std::shared_ptr out(new Tensor); + DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, + out.get()); + CopyVariableWithTensor(*var, *(out.get()), *trans_var); + } + } + } + } } - kernel_iter->second->Compute(ctx); + auto kernel_iter = kernels.find(expected_kernel_key); - // throws errors if have. 
- dev_ctx.Finish(); -} -OpKernelType OperatorWithKernel::GetKernelType( - const ExecutionContext& ctx) const { - return OpKernelType(IndicateDataType(ctx), ctx.device_context()); + auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); + kernel_iter->second->Compute( + ExecutionContext(*this, new_scope, *new_dev_ctx)); + + /*For profiling/benchmark only*/ + if (FLAGS_op_sync) { + new_dev_ctx->Wait(); + } } -DataType OperatorWithKernel::IndicateDataType( + +proto::DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { auto& scope = ctx.scope(); int data_type = -1; @@ -460,7 +558,18 @@ DataType OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast(data_type); + return static_cast(data_type); +} + +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place()); } } // namespace framework diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 60861d92933dd100f877bec8d43f9b924f951e60..c9140f304c89e32a0fa8bd24722cc2e5dbc4e2e1 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -17,21 +17,21 @@ limitations under the License. */ #include #include #include +#include #include #include #include "glog/logging.h" // For VLOG #include "paddle/framework/attribute.h" #include "paddle/framework/block_desc.h" -#include "paddle/framework/data_type.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_info.h" +#include "paddle/framework/op_kernel_type.h" #include "paddle/framework/scope.h" #include "paddle/framework/selected_rows.h" #include "paddle/framework/tensor.h" #include "paddle/platform/device_context.h" -#include "paddle/platform/place.h" #include "paddle/platform/variant.h" #include "paddle/utils/Error.h" @@ -53,6 +53,10 @@ constexpr char kGradVarSuffix[] = "@GRAD"; /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; +// define some kernel priority +/* Define multiple kernel type fallback order*/ +extern std::vector> kKernelPriority; + inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; } @@ -80,11 +84,16 @@ class OperatorBase { return boost::get(attrs_.at(name)); } - virtual std::string DebugString() const; + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + + std::string DebugString() const { return DebugStringEx(nullptr); } /// Net will call this function to Run an op. - virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const = 0; + virtual void Run(const Scope& scope, const platform::Place& place) const = 0; + + // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. 
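+ // A long-running operator (for now only recv_op with its event loop)
+ // overrides Stop() to break out of that loop; the default no-op
+ // leaves ordinary operators unaffected.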
+ virtual void Stop() {} virtual bool IsNetOp() const { return false; } @@ -159,8 +168,7 @@ class OperatorBase { class NOP : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} std::unique_ptr Clone() const override { return std::unique_ptr(new NOP(*this)); } @@ -276,17 +284,25 @@ class ExecutionContext { out_tensor->set_lod(in_tensor.lod()); } - template ::EigenDeviceType> - DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + const platform::DeviceContext& device_context() const { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + //! Get actual name vector for this input. const std::vector& Inputs(const std::string& name) const { return op_.Inputs(name); @@ -297,14 +313,6 @@ class ExecutionContext { return op_.Outputs(name); } -#ifdef PADDLE_WITH_CUDA - const inline platform::CUDADeviceContext& cuda_device_context() const { - PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); - return *reinterpret_cast( - &device_context_); - } -#endif - private: const OperatorBase& op_; const Scope& scope_; @@ -345,33 +353,6 @@ class OpKernel : public OpKernelBase { using ELEMENT_TYPE = T; }; -struct OpKernelType { - struct Hash { - std::hash hash_; - size_t operator()(const OpKernelType& key) const { - int place = key.place_.which(); - int data_type = static_cast(key.data_type_); - int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT | - (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1)); - return hash_(pre_hash); - } - }; - - platform::Place place_; - DataType data_type_; - - OpKernelType(DataType data_type, platform::Place place) - : place_(place), data_type_(data_type) {} - - OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx) - : place_(dev_ctx.GetPlace()), data_type_(data_type) {} - - bool operator==(const OpKernelType& o) const { - return platform::places_are_same_class(place_, o.place_) && - data_type_ == o.data_type_; - } -}; - class OperatorWithKernel : public OperatorBase { public: using OpKernelMap = @@ -382,8 +363,7 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const final; + void Run(const Scope& scope, const platform::Place& place) const final; static std::unordered_map& AllOpKernels() { @@ -404,16 +384,17 @@ class OperatorWithKernel : public OperatorBase { } protected: - virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; private: // indicate kernel DataType by input data. Defaultly all input data must be // same. 
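// (IndicateDataType, defined in operator.cc above, scans the input tensors
// and PADDLE_ENFORCEs that a data type was found; inputs are assumed to
// share a single type.)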
- DataType IndicateDataType(const ExecutionContext& ctx) const; + proto::DataType IndicateDataType(const ExecutionContext& ctx) const; }; -std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key); - extern bool OpSupportGPU(const std::string& op_type); } // namespace framework diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 1e19f82b341768142258ba4a5dfa246d87ba4c43..b69d7c7a7406eb3e18d385c568cb9c21b9b4107b 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#include "paddle/framework/operator.h" #include "gtest/gtest.h" + +#include "paddle/framework/init.h" #include "paddle/framework/op_info.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { @@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase { OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs), x(1) {} - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, const platform::Place& place) const override { ++op_run_num; ASSERT_EQ(static_cast(inputs_.size()), 1); ASSERT_EQ(static_cast(outputs_.size()), 1); @@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase { int x{0}; }; -class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { public: - OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, - OpAttrChecker* op_checker) + OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); @@ -58,35 +57,36 @@ class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { static void BuildVar(const std::string& param_name, std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { + paddle::framework::proto::OpDesc::Var* var) { var->set_parameter(param_name); for (auto& arg_name : arguments) { *var->mutable_arguments()->Add() = arg_name; } } -REGISTER_OP_WITHOUT_GRADIENT( - test_operator, paddle::framework::OpWithoutKernelTest, - paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); TEST(OperatorBase, all) { - paddle::framework::OpDesc op_desc; + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); BuildVar("output", {"OUT1"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); scope.Var("OUT1"); ASSERT_EQ(paddle::framework::op_run_num, 0); - 
op->Run(scope, device_context); + op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::op_run_num, 1); } @@ -114,8 +114,9 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} - OpKernelType GetKernelType(const ExecutionContext& ctx) const override { - return OpKernelType(DataType::FP32, ctx.device_context()); + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); } }; @@ -123,7 +124,6 @@ template class CPUKernelTest : public OpKernel { public: void Compute(const ExecutionContext& ctx) const { - std::cout << "this is cpu kernel" << std::endl; std::cout << ctx.op().DebugString() << std::endl; cpu_kernel_run_num++; ASSERT_EQ(ctx.op().Input("x"), "IN1"); @@ -195,22 +195,23 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, // test with single input TEST(OpKernel, all) { - paddle::framework::OpDesc op_desc; + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_kernel"); BuildVar("x", {"IN1"}, op_desc.add_inputs()); BuildVar("y", {"OUT1"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); - op->Run(scope, cpu_device_context); + op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); } @@ -224,7 +225,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, TEST(OpKernel, multi_inputs) { using namespace paddle::framework; - OpDesc op_desc; + paddle::framework::InitDevices(); + proto::OpDesc op_desc; + op_desc.set_type("op_multi_inputs_with_kernel"); BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); BuildVar("k", {"k0"}, op_desc.add_inputs()); @@ -232,10 +235,10 @@ TEST(OpKernel, multi_inputs) { auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; scope.Var("x0")->GetMutable(); scope.Var("x1")->GetMutable(); @@ -245,7 +248,7 @@ TEST(OpKernel, multi_inputs) { scope.Var("y1")->GetMutable(); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - op->Run(scope, cpu_device_context); + op->Run(scope, cpu_place); } class OperatorClone : public paddle::framework::OperatorBase { @@ -257,11 +260,14 @@ class OperatorClone : public paddle::framework::OperatorBase { const paddle::framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const paddle::framework::Scope& scope, - const paddle::platform::DeviceContext& dev_ctx) const override {} + const paddle::platform::Place& place) const override {} }; TEST(Operator, Clone) { - OperatorClone a("ABC", {}, {}, {}); + paddle::framework::InitDevices(); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); auto b = a.Clone(); ASSERT_EQ(a.Type(), b->Type()); } diff --git a/paddle/framework/program_desc.cc 
b/paddle/framework/program_desc.cc index 4af8d94563ad0ecf6fcc6fe0575b0f69006a9a2d..b5d9e5e385c1ba57169ef885824fc23b0f130692 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -18,49 +18,49 @@ limitations under the License. */ namespace paddle { namespace framework { -BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) { +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { auto *b = desc_.add_blocks(); b->set_parent_idx(parent.ID()); b->set_idx(desc_.blocks_size() - 1); - blocks_.emplace_back(new BlockDescBind(this, b)); + blocks_.emplace_back(new BlockDesc(this, b)); return blocks_.back().get(); } -ProgramDesc *ProgramDescBind::Proto() { +proto::ProgramDesc *ProgramDesc::Proto() { for (auto &block : blocks_) { block->Flush(); } return &desc_; } -ProgramDescBind::ProgramDescBind() { +ProgramDesc::ProgramDesc() { auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); - blocks_.emplace_back(new BlockDescBind(this, block)); + blocks_.emplace_back(new BlockDesc(this, block)); } -ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) { +ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); - blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this)); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); } } -ProgramDescBind::ProgramDescBind(const ProgramDesc &desc) { +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { desc_ = desc; for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDescBind(this, &block_desc)); + blocks_.emplace_back(new BlockDesc(this, &block_desc)); } } -ProgramDescBind::ProgramDescBind(const std::string &binary_str) { +ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDescBind(this, &block_desc)); + blocks_.emplace_back(new BlockDesc(this, &block_desc)); } } diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h index b1cb086de4345902482d8254b8aeec041ecf81bc..15a962bb696d6172acd1a83cf9bb1ffd0846d449 100644 --- a/paddle/framework/program_desc.h +++ b/paddle/framework/program_desc.h @@ -23,32 +23,32 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class BlockDescBind; +class BlockDesc; -class ProgramDescBind { +class ProgramDesc { public: - ProgramDescBind(); + ProgramDesc(); - explicit ProgramDescBind(const ProgramDesc &desc); + explicit ProgramDesc(const proto::ProgramDesc &desc); - ProgramDescBind(const ProgramDescBind &o); + ProgramDesc(const ProgramDesc &o); - explicit ProgramDescBind(const std::string &binary_str); + explicit ProgramDesc(const std::string &binary_str); - BlockDescBind *AppendBlock(const BlockDescBind &parent); + BlockDesc *AppendBlock(const BlockDesc &parent); - BlockDescBind *MutableBlock(size_t idx) { return blocks_[idx].get(); } + BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); } - const BlockDescBind &Block(size_t idx) const { return *blocks_[idx]; } + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } size_t Size() const { return blocks_.size(); } - ProgramDesc *Proto(); + proto::ProgramDesc *Proto(); private: - ProgramDesc desc_; + proto::ProgramDesc desc_; - std::vector> blocks_; + std::vector> blocks_; }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc index 83e7286e0ec3639fa589b0958922543a3ba16a00..59947c9f2189348226b7ff6c2b9315196bbf55fa 100644 --- a/paddle/framework/program_desc_test.cc +++ b/paddle/framework/program_desc_test.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
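// An illustrative round-trip through the renamed ProgramDesc API above
// (formerly ProgramDescBind). This is a sketch, not code from this diff;
// it assumes only the constructors and accessors declared in program_desc.h.
#include <string>

#include "paddle/framework/program_desc.h"
#include "paddle/platform/enforce.h"

void ProgramDescRoundTripSketch() {
  using paddle::framework::ProgramDesc;
  ProgramDesc program;                    // constructed with a root block
  program.AppendBlock(program.Block(0));  // add a child of the root block
  std::string binary;
  program.Proto()->SerializeToString(&binary);  // Flush() + proto serialization
  ProgramDesc restored(binary);           // parse back from the binary string
  PADDLE_ENFORCE_EQ(restored.Size(), program.Size());
}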
*/ #include "paddle/framework/program_desc.h" #include "gtest/gtest.h" @@ -19,18 +19,18 @@ namespace paddle { namespace framework { TEST(ProgramDesc, copy_ctor) { - ProgramDescBind program; + ProgramDesc program; auto* global_block = program.MutableBlock(0); auto* x = global_block->Var("X"); - x->SetType(VarDesc_VarType_LOD_TENSOR); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); x->SetLoDLevel(0); - x->SetDataType(FP32); + x->SetDataType(proto::FP32); x->SetShape({1000, 784}); auto* y = global_block->Var("Y"); - y->SetType(VarDesc_VarType_LOD_TENSOR); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); y->SetLoDLevel(0); - y->SetDataType(FP32); + y->SetDataType(proto::FP32); y->SetShape({784, 100}); auto* op = global_block->AppendOp(); @@ -39,15 +39,15 @@ TEST(ProgramDesc, copy_ctor) { op->SetInput("Y", {y->Name()}); auto* out = global_block->Var("Out"); - out->SetType(VarDesc_VarType_LOD_TENSOR); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); op->SetOutput("Y", {out->Name()}); - ProgramDescBind program_copy(program); + ProgramDesc program_copy(program); auto* global_block_copy = program_copy.MutableBlock(0); ASSERT_NE(global_block, global_block_copy); - auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { ASSERT_TRUE(global_block_copy->HasVar(name)); auto* copy = global_block_copy->Var(name); ASSERT_NE(copy, var_before); @@ -81,18 +81,18 @@ TEST(ProgramDesc, copy_ctor) { } TEST(ProgramDescBind, serialize_and_deserialize) { - ProgramDescBind program_origin; + ProgramDesc program_origin; auto* global_block = program_origin.MutableBlock(0); auto* x = global_block->Var("X"); - x->SetType(VarDesc_VarType_LOD_TENSOR); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); x->SetLoDLevel(0); - x->SetDataType(FP32); + x->SetDataType(proto::FP32); x->SetShape({1000, 784}); auto* y = global_block->Var("Y"); - y->SetType(VarDesc_VarType_LOD_TENSOR); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); y->SetLoDLevel(0); - y->SetDataType(FP32); + y->SetDataType(proto::FP32); y->SetShape({784, 100}); auto* op = global_block->AppendOp(); @@ -101,17 +101,17 @@ TEST(ProgramDescBind, serialize_and_deserialize) { op->SetInput("Y", {y->Name()}); auto* out = global_block->Var("Out"); - out->SetType(VarDesc_VarType_LOD_TENSOR); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); op->SetOutput("Y", {out->Name()}); std::string binary_str; program_origin.Proto()->SerializeToString(&binary_str); - ProgramDescBind program_restored(binary_str); + ProgramDesc program_restored(binary_str); auto* global_block_restored = program_restored.MutableBlock(0); ASSERT_NE(global_block, global_block_restored); - auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { ASSERT_TRUE(global_block_restored->HasVar(name)); auto* restored = global_block_restored->Var(name); ASSERT_NE(restored, var_before); diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc index bf3066983cdcf44ae84f236ac72486e5d4fd5b92..25eb813ffb96e9b1e13299421ead9f85c02da59f 100644 --- a/paddle/framework/prune.cc +++ b/paddle/framework/prune.cc @@ -26,8 +26,10 @@ namespace framework { const std::string kFeedOpType = "feed"; const std::string kFetchOpType = "fetch"; +const std::string kDropOutOpType = "dropout"; +const std::string kBatchNormOpType = "batch_norm"; -bool HasDependentVar(const OpDesc& op_desc, +bool HasDependentVar(const proto::OpDesc& op_desc, 
const std::set& dependent_vars) { for (auto& var : op_desc.outputs()) { for (auto& argu : var.arguments()) { @@ -39,14 +41,15 @@ bool HasDependentVar(const OpDesc& op_desc, return false; } -bool IsTarget(const OpDesc& op_desc) { +bool IsTarget(const proto::OpDesc& op_desc) { if (op_desc.has_is_target()) { return op_desc.is_target(); } return false; } -void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) { +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id) { // TODO(tonyyang-svail): // - will change to use multiple blocks for RNN op and Cond Op @@ -102,9 +105,31 @@ void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) { } // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies -void Prune(const ProgramDesc& input, ProgramDesc* output) { +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { prune_impl(input, output, 0); } +void inference_optimize_impl(const proto::ProgramDesc& input, + proto::ProgramDesc* output, int block_id) { + *output = input; + auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); + for (auto& op_desc : *op_field) { + if (op_desc.type() == kDropOutOpType || + op_desc.type() == kBatchNormOpType) { + for (auto& attr : *op_desc.mutable_attrs()) { + if (attr.name() == "is_test") { + attr.set_b(true); + break; + } + } + } + } +} + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output) { + inference_optimize_impl(input, output, 0); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/prune.h b/paddle/framework/prune.h index 8cfb16343aa44dcc8a3349b01adecce33f1c2b5b..593292523d0c14136791bb804a4721a0740b47ba 100644 --- a/paddle/framework/prune.h +++ b/paddle/framework/prune.h @@ -20,7 +20,10 @@ limitations under the License. */ namespace paddle { namespace framework { -void Prune(const ProgramDesc& input, ProgramDesc* output); +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output); } // namespace framework } // namespace paddle diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc index 5988874809f51c09b3d3d279be6c1e8d43d7a782..d76c5abca94cb87220ce73537a8657c3ec695f4d 100644 --- a/paddle/framework/prune_test.cc +++ b/paddle/framework/prune_test.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
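// A sketch (hypothetical driver, not part of this diff) of how the two entry
// points declared in prune.h above might be combined to prepare a program for
// inference: mark a target op, prune, then flip is_test on dropout/batch_norm.
#include "paddle/framework/prune.h"

void PruneForInferenceSketch(paddle::framework::proto::ProgramDesc* full) {
  namespace fw = paddle::framework;
  // Mark the last op of block 0 as the target; Prune keeps only the ops the
  // target depends on.
  full->mutable_blocks(0)
      ->mutable_ops(full->blocks(0).ops_size() - 1)
      ->set_is_target(true);
  fw::proto::ProgramDesc pruned;
  fw::Prune(*full, &pruned);
  // Rewrite dropout/batch_norm ops into test mode (is_test = true).
  fw::proto::ProgramDesc inference_program;
  fw::InferenceOptimize(pruned, &inference_program);
}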
+See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/prune.h" @@ -29,12 +29,12 @@ namespace ops = paddle::operators; void AddOp(const std::string &type, const f::VariableNameMap &inputs, const f::VariableNameMap &outputs, f::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { + paddle::framework::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); + var->SetDataType(paddle::framework::proto::DataType::FP32); } } @@ -51,88 +51,102 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, } TEST(Prune, one_operator) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); - f::ProgramDesc *pdesc = program.Proto(); - f::ProgramDesc pruned; + f::proto::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); - Prune(*pdesc, &pruned); + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); } TEST(Prune, forward) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block); - AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { - f::ProgramDesc pruned; + f::proto::ProgramDesc pruned; pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); - Prune(*pdesc, &pruned); + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); } } TEST(Prune, multi_input_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block); - AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block); - AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {}, + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + 
f::AttributeMap{}, block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); - f::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); } TEST(Prune, multi_output_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); - f::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); } TEST(Prune, multi_target) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); - f::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); } diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 9ad6272c99dd6a85520ae44c1331ac232bc6a9a2..2bd0ac8f5a9eb6439a4196dd9c61e13797c1a8e3 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include // for unique_ptr #include // for call_once #include "glog/logging.h" +#include "paddle/framework/threadpool.h" #include "paddle/string/printf.h" namespace paddle { @@ -36,12 +37,9 @@ Scope& Scope::NewScope() const { } Variable* Scope::Var(const std::string& name) { - auto iter = vars_.find(name); - if (iter != vars_.end()) { - VLOG(3) << "Get existing variable " << name; - return iter->second; - } - Variable* v = new Variable(); + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + v = new Variable(); vars_[name] = v; VLOG(3) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); @@ -57,8 +55,10 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - auto it = vars_.find(name); - if (it != vars_.end()) return it->second; + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); } @@ -75,17 +75,9 @@ void Scope::DropKids() { kids_.clear(); } -std::vector Scope::GetAllNames(bool recursive) const { - std::vector known_vars(vars_.size()); - - if (recursive) { - for (auto& kid : kids_) { - auto kid_vars = kid->GetAllNames(); - for (auto& p : kid_vars) { - known_vars.emplace_back(p); - } - } - } +std::vector Scope::LocalVarNames() const { + std::vector known_vars; + known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { known_vars.emplace_back(p.first); } @@ -96,7 +88,8 @@ void Scope::DeleteScope(Scope* scope) { auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); - delete scope; + // Make delete async. + Async([scope] { delete scope; }); } void Scope::Rename(const std::string& origin_name, @@ -117,5 +110,11 @@ std::string Scope::Rename(const std::string& origin_name) const { return var_name; } +Variable* Scope::FindVarLocally(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return nullptr; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c..a1da81cc7977d2f31b99c41fb3db3ec03188f954 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -66,7 +66,7 @@ class Scope { void DropKids(); // enumerate all the variables current contains. - std::vector GetAllNames(bool recursive = false) const; + std::vector LocalVarNames() const; // Rename variable to a new name void Rename(const std::string& origin_name, @@ -75,6 +75,8 @@ class Scope { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; + Variable* FindVarLocally(const std::string& name) const; + private: // Call Scope::NewScope for a sub-scope. 
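// A short sketch (not from this diff) of the lookup rules implemented above:
// Var() reuses an existing local variable, FindVar() searches ancestor
// scopes, and the new FindVarLocally() stops at the current scope.
#include "paddle/framework/scope.h"
#include "paddle/platform/enforce.h"

void ScopeLookupSketch() {
  paddle::framework::Scope parent;
  parent.Var("w");                  // created once; a second Var("w") reuses it
  auto& child = parent.NewScope();
  PADDLE_ENFORCE(child.FindVar("w") != nullptr);         // walks up to parent
  PADDLE_ENFORCE(child.FindVarLocally("w") == nullptr);  // local-only lookup
  parent.DeleteScope(&child);       // now deleted asynchronously via Async()
}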
explicit Scope(Scope const* parent) : parent_(parent) {} diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc index f738d5ba9ecda57ea25bb5f84057d1d0106eef66..0f5b86061dbdebde08badca7f984f4a2c8d7bc79 100644 --- a/paddle/framework/scope_test.cc +++ b/paddle/framework/scope_test.cc @@ -61,7 +61,7 @@ TEST(Scope, GetAllNames) { Variable* v = s.Var("a"); EXPECT_EQ(&s, s.FindScope(v)); - std::vector ans = s.GetAllNames(); + std::vector ans = s.LocalVarNames(); std::string str; for (auto& var : ans) { str += var; diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc index c74459c9dd7006a24615b1d6df041583088fb25c..3b3e60177a495cc99f38ee8b82af41c4c76b8652 100644 --- a/paddle/framework/selected_rows.cc +++ b/paddle/framework/selected_rows.cc @@ -12,5 +12,58 @@ limitations under the License. */ #include "paddle/framework/selected_rows.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2nd field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3rd field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4th field, Tensor data + SerializeToStream(os, selected_rows.value(), dev_ctx); +} + +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx) { + { + // the 1st field, uint32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2nd field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3rd field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4th field, tensor which contains the data + DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx); +} + +} // namespace framework } // namespace paddle diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h index 0332b91323e3a4b4b80e02302ad3dcafe0986cde..30d3dfc1e89f073a8180ceacf77619b36f7079a9 100644 --- a/paddle/framework/selected_rows.h +++ b/paddle/framework/selected_rows.h @@ -59,5 +59,15 @@ class SelectedRows { int64_t height_; }; +/* + * Serialize/Deserialize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serialize to a file + * or to an in-memory string. GPU tensor will be copied to CPU.
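+ * For reference, the byte layout written by the functions above is: + * | uint32_t version (0) | uint64_t rows count | rows count * int64_t row + * indices | int64_t height | value Tensor via the Tensor stream codec |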
+ */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc index 4ee13a65d72e44693573397bb686b355effb2227..8ff3fb6a97199a2798ab29c56957a0f77fa26628 100644 --- a/paddle/framework/selected_rows_test.cc +++ b/paddle/framework/selected_rows_test.cc @@ -43,5 +43,21 @@ TEST_F(SelectedRowsTester, complete_dims) { ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); } +TEST_F(SelectedRowsTester, SerializeAndDeseralize) { + SelectedRows dst_tensor; + platform::CPUDeviceContext cpu_ctx(place_); + std::ostringstream oss; + + SerializeToStream(oss, *selected_rows_, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + + ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); + ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); + ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); + ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index 0af41b164f5894db17b2f86d4eba371cf05e3b41..e53cc0cdabc623ae358f1a3e21823a2f38ec3c62 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -1,17 +1,19 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/shape_inference.h" +#include "grad_op_desc_maker.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { @@ -22,6 +24,12 @@ std::vector InferShapeContext::GetInputsDim( return GetDims(names); } +DDim InferShapeContext::GetInputsElementDim(const std::string &name, + int idx) const { + const std::vector &names = Inputs(name); + return this->GetDim(names[idx]); +} + void InferShapeContext::SetOutputsDim( const std::string &name, const std::vector &dims) { auto &names = Outputs(name); @@ -43,20 +51,23 @@ void InferShapeContext::SetDims(const std::vector &names, size_t length = names.size(); PADDLE_ENFORCE_EQ(length, dims.size()); for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } SetDim(names[i], dims[i]); } } -std::vector InferShapeContext::GetInputsVarType( +std::vector InferShapeContext::GetInputsVarType( const std::string &name) const { return GetVarTypes(Inputs(name)); } -std::vector InferShapeContext::GetOutputsVarType( +std::vector InferShapeContext::GetOutputsVarType( const std::string &name) const { return GetVarTypes(Outputs(name)); } -std::vector InferShapeContext::GetVarTypes( +std::vector InferShapeContext::GetVarTypes( const std::vector &names) const { - std::vector retv; + std::vector retv; retv.resize(names.size()); std::transform(names.begin(), names.end(), retv.begin(), std::bind(std::mem_fn(&InferShapeContext::GetVarType), this, diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 05dc47f06ac81f0acb6d0317cbecb3009c7dd7f0..f93319d8f2fd4c5d388bd57fd595a6a5edd51775 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -27,8 +27,9 @@ class InferShapeContext { virtual bool HasInput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0; - std::vector GetInputsVarType(const std::string &name) const; - std::vector GetOutputsVarType( + std::vector GetInputsVarType( + const std::string &name) const; + std::vector GetOutputsVarType( const std::string &name) const; virtual bool HasInputs(const std::string &name) const = 0; @@ -37,6 +38,7 @@ class InferShapeContext { virtual framework::DDim GetInputDim(const std::string &name) const = 0; std::vector GetInputsDim(const std::string &name) const; + DDim GetInputsElementDim(const std::string &name, int idx) const; virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputsDim(const std::string &name, @@ -64,10 +66,10 @@ class InferShapeContext { std::vector GetDims( const std::vector &names) const; - std::vector GetVarTypes( + std::vector GetVarTypes( const std::vector &names) const; - virtual VarDesc::VarType GetVarType(const std::string &name) const = 0; + virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0; }; } // namespace framework diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc index ea7b2a1f7b17d9abc2c2e14de5ecd1cf4a1a5027..f922e606249849e621e679f71d6dbe0f007c8464 100644 --- a/paddle/framework/tensor.cc +++ b/paddle/framework/tensor.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
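// A sketch of an InferShape body using the helpers declared above, including
// the new GetInputsElementDim(). The op and its parameter names ("X", "Out")
// are hypothetical, not taken from this diff.
#include "paddle/framework/shape_inference.h"
#include "paddle/platform/enforce.h"

class SumLikeInferShapeSketch {
 public:
  void InferShape(paddle::framework::InferShapeContext* ctx) const {
    auto dims = ctx->GetInputsDim("X");  // dims of every input under "X"
    // Inspect a single element of the input list without fetching all dims.
    paddle::framework::DDim first = ctx->GetInputsElementDim("X", 0);
    PADDLE_ENFORCE_EQ(first, dims[0]);
    // Outputs named kEmptyVarName are silently skipped by SetDims().
    ctx->SetOutputDim("Out", dims[0]);
  }
};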
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/tensor.h" diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 28d0fcf94ec31c82476e093f93ccee222a0c9d9a..4aaa29d794c95592832a1fe990e2dce274eba9d5 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -20,12 +20,12 @@ limitations under the License. */ #include #include +#include "paddle/framework/data_layout.h" #include "paddle/framework/ddim.h" #include "paddle/memory/memory.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -55,6 +55,10 @@ class Tensor { template inline const T* data() const; + inline bool IsInitialized() const; + + inline void switch_place(platform::Place new_place); + /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. @@ -89,34 +93,6 @@ class Tensor { /*! The internal of two tensors share the same memory block. */ inline Tensor& ShareDataWith(const Tensor& src); - /** - * @brief Copy the content of external tensor to a new place. - * - * @param[in] src The external tensor. - * @param[in] dst_place The dst place. - * @param[in] ctx The device context contains device resources. - * - * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. - */ - // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647 - // Remove `CopyFrom` and `CopyFromVector` from Tensor interface - // and make them global functions - inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx); - - /** - * @brief Copy the content of an external vector to a tensor. - * - * @param[in] src The external tensor. - * @param[in] ctx The device context contains device resources. - * - * * @note CopyFromVector assumes that the tensor has been resized - * before invoking. - */ - template - inline void CopyFromVector(const std::vector& src, - const platform::DeviceContext& ctx); - /** * @brief Return a sub-tensor of the given tensor. * @@ -141,9 +117,12 @@ class Tensor { size_t memory_size() const; - private: inline void check_memory_size() const; + inline DataLayout layout() const { return layout_; } + + inline void set_layout(const DataLayout layout) { layout_ = layout; } + private: friend class LoDTensor; @@ -202,6 +181,19 @@ class Tensor { DDim dims_; + /** + * @brief the layout of the memory block, default is NHWC. + * + * @note the memory allocation order, describes how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layouts. They are + * NCHW, NHWC, CHWN. + * N,C,H,W stand respectively for the batch size, the number of + * feature maps, the height, and the width.
+ */ + + DataLayout layout_ = DataLayout::kNHWC; + /** * @brief A PlaceHolder may be shared by more than one tensor. * @@ -212,6 +204,15 @@ class Tensor { size_t offset_; }; +inline void Tensor::switch_place(platform::Place new_place) { + if (holder_->place() == new_place) { + return; + } + + // TODO(tonyyang-svail): do memcpy here. + PADDLE_THROW("Not Implemented"); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor.md b/paddle/framework/tensor.md index 7a80816d8e4ffa3a9462f3d9b87eff0f048466aa..0a27ac9bb6b03649d42e12100fda9e80a56e7f56 100644 --- a/paddle/framework/tensor.md +++ b/paddle/framework/tensor.md @@ -71,7 +71,7 @@ private: ``` ```c++ -typedef boost::variant Place; +typedef boost::variant Place; typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar; typedef boost::variant< diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc deleted file mode 100644 index 0947e33548130a923e998f8bad68db00097af909..0000000000000000000000000000000000000000 --- a/paddle/framework/tensor_array.cc +++ /dev/null @@ -1,444 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - - - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/tensor_array.h" - -#include -#include -#include - -#include "paddle/framework/eigen.h" - -namespace paddle { -namespace framework { - -namespace detail { - -/* - * Offer an iterator over the length-sorted lod-tensor's top level. The top - * level of a lod-tensor stores batch-size of sequences, each top-level sequence - * may contains several lower-level sequences, sort top-level lod by the numbers - * of lower-level sequences in descending order, so that during RNN's running, - * the batch-size will keep decreasing, the short sentences will end at the tail - * of each batch. - * - * Let's take a simple lod-tensor for example - * - * |(0) |(1) top-level has two instances - * ||| ||||| lower-level - * - * sort by lower-level's length - * - * |(1) |(0) - * ||||| ||| - * - * when RNN runs, it get 5 batches (equals the number of elements the longest - * sequence has) - * - * ||||| - * ||| - * - * the first three batches has two elements, the last two elements just has 1 - * element each. 
- */ -struct DynamicBatchUnpacker { - using value_type = float; - - DynamicBatchUnpacker(const LoDTensor& source, size_t level, - bool descend = true) - : source(&source), level(level) { - BuildLengthSortedMeta(descend); - } - - LoDTensor GetBatch(size_t index); - - std::vector meta; - - LoDTensor const* source; - size_t level; - - protected: - void BuildLengthSortedMeta(bool descend); -}; - -LoDTensor PackDynamicBatch(const std::vector& source, - const std::vector& meta, const LoD& lod, - size_t level); - -std::vector GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) { - // collect indice need to copy to the batch - std::vector indice; - for (const auto& seq : meta) { - size_t id = seq.begin + batch_id; - if (id >= seq.end) break; - indice.push_back(id); - } - return indice; -} - -} // namespace detail - -const LoDTensor& TensorArray::Read(size_t index) const { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - if (index >= size()) { - values_.resize(index + 1); - } - return values_[index]; -} - -void TensorArray::Write(size_t index, const LoDTensor& value) { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - - if (index >= size()) { - values_.resize(index + 1); - } - - values_[index].set_lod(value.lod()); - values_[index].Resize(value.dims()); - values_[index].mutable_data(value.place()); - values_[index].CopyFrom(value, value.place(), platform::CPUDeviceContext()); -} - -void TensorArray::WriteShared(size_t index, const LoDTensor& value) { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - if (index >= size()) { - values_.resize(index + 1); - } - - values_[index].set_lod(value.lod()); - values_[index].ShareDataWith(value); -} - -LoDTensor TensorArray::Pack(size_t level, const std::vector& meta, - const LoD& lod) const { - return detail::PackDynamicBatch(values_, meta, lod, level); -} - -DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level, - bool length_desend) { - detail::DynamicBatchUnpacker unpacker(source, level, - length_desend /*descend*/); - - // find max length of all the sequences - size_t max_length = 0; - for (const auto& seq : unpacker.meta) { - max_length = std::max(max_length, seq.end - seq.begin); - } - - // write batches to values - for (size_t batch_id = 0; batch_id < max_length; batch_id++) { - Write(batch_id, unpacker.GetBatch(batch_id)); - } - - PADDLE_ENFORCE(!unpacker.meta.empty()); - return unpacker.meta; -} - -LoDTensor TensorArray::LodPack(size_t level) const { - PADDLE_ENFORCE_GT(size(), 0UL, "no time step exists"); - // the levels should be no less than 2 - LoDTensor merged; - const LoDTensor *pre, *cur; - pre = &Read(0); - - for (size_t step = 1; step < size(); step++) { - cur = &Read(step); - PADDLE_ENFORCE_GT(cur->NumLevels(), 0); - PADDLE_ENFORCE_GT(pre->NumLevels(), 0); - PADDLE_ENFORCE_EQ(pre->NumLevels(), cur->NumLevels()); - PADDLE_ENFORCE_EQ(pre->NumElements(level), cur->NumElements(level)); - - merged = LodPackTwo(*pre, *cur, level); - pre = &merged; - } - return merged; -} - -/* - * NOTE currently, only the lowest level supports packing. - * The lowest LoD will be changed, while the relative offsets in levels above - * stay unchanged. 
- * - * previous step : [0] [1] [3] - * current step: [0 1 2] [2 3] [] - * packed to - * [0 0] [0 1] [0 2] [1 2] [1 3] [3] - */ -LoDTensor TensorArray::LodPackTwo(const LoDTensor& pre, const LoDTensor& cur, - size_t level) const { - PADDLE_ENFORCE_EQ(pre.NumLevels(), cur.NumLevels()); - PADDLE_ENFORCE_EQ(pre.NumLevels(), level + 1, - "Only the lowest LoD level supports pack temporarily."); - // calculate the result tensor's shape first - size_t num_instances = 0; - for (size_t elem = 0; elem < pre.NumElements(level); elem++) { - size_t prefix_size = pre.NumElements(level, elem); - size_t num_candidates = cur.NumElements(level, elem); - if (num_candidates > 0) { - num_instances += num_candidates * (prefix_size + 1); - } else { - num_instances += prefix_size; - } - } - - auto res_dims = pre.dims(); - res_dims[0] = num_instances; - LoDTensor result; - result.Resize(res_dims); - result.mutable_data(cur.place()); - - Vector last_lod_level; - // copy data - size_t index = 0; - last_lod_level.push_back(index); - for (size_t elem = 0; elem < pre.NumElements(level); elem++) { - size_t prefix_size = pre.NumElements(level, elem); - size_t num_candidates = cur.NumElements(level, elem); - - // slice the prefix Tensor - LoDTensor prefix = pre; - prefix.ShrinkInLevel(level, elem, elem + 1); - LoDTensor candidate = cur; - if (num_candidates > 0) { - candidate.ShrinkInLevel(level, elem, elem + 1); - } else { // just push prefix - result.Slice(index, index + prefix_size) - .CopyFrom(prefix, result.place(), platform::CPUDeviceContext()); - index += prefix_size; - last_lod_level.push_back(index); - } - for (size_t candi = 0; candi < num_candidates; candi++) { - // TODO(superjom) support GPU - result.Slice(index, index + prefix_size) - .CopyFrom(prefix, result.place(), platform::CPUDeviceContext()); - index += prefix_size; - // copy candidate record - result.Slice(index, index + 1) - .CopyFrom(candidate.Slice(candi, candi + 1), result.place(), - platform::CPUDeviceContext()); - index++; - last_lod_level.push_back(index); - } - } - - // update lod - auto lod = cur.lod(); - lod.back() = last_lod_level; - result.set_lod(lod); - return result; -} - -/* - * source [0 1 2] [3 4] [5 6 7] will be transformd to a list of LoDTensors such - * as - * [0 3 5] [1 4 6] [2 7] with 1-level LoDs: - * - [0 1 2 3] - * - [0 1 2 3] - * - [0 1 1 2], the [1,1) here means the second sequence is empty - * - * NOTE Unpack a LoDTensor in this approach may result in a big LoD. 
- */ -void TensorArray::LodUnpack(const LoDTensor& source, size_t level) { - PADDLE_ENFORCE_EQ(level, source.NumLevels() - 1, - "only the lowest LoD level supports unpack."); - const size_t non_empty_instances = source.dims()[0]; - size_t index = 0; - Vector lowest_lod_level; - lowest_lod_level.push_back(index); - - for (size_t step = 0; step < non_empty_instances; step++) { - size_t num_instances = 0; - for (size_t id = 0; id < source.NumElements(level); id++) { - auto instance = source; - instance.ShrinkInLevel(level, id, id + 1); - if (static_cast(instance.dims()[0]) > step) { - num_instances++; - index++; - } - lowest_lod_level.push_back(index); - } - - // create tensor for this time step - LoDTensor tensor; - auto dims = source.dims(); - dims[0] = num_instances; - // set lod - auto lod = source.lod(); - lod.back() = lowest_lod_level; - tensor.set_lod(lod); - - index = 0; - for (size_t id = 0; id < source.NumElements(level); id++) { - auto instance = source; - instance.ShrinkInLevel(level, id, id + 1); - if (static_cast(instance.dims()[0]) > step) { - // copy this instance - tensor.Slice(index, index + 1) - .CopyFrom(instance.Slice(step, step + 1), tensor.place(), - platform::CPUDeviceContext()); - index++; - } - } - Write(step, tensor); - } -} - -LoDTensor TensorArray::Stack() const { - LoDTensor result; - if (size() == 0) return result; - - const auto& first_dims = values_.front().dims(); - // check all the values have the same shape - // TODO(superjom) check the same dtypes - for (size_t idx = 1; idx < size(); idx++) { - const auto& value_dims = values_[idx].dims(); - PADDLE_ENFORCE_EQ(first_dims, value_dims); - } - - // copy - auto result_dims = vectorize(first_dims); - result_dims.insert(result_dims.begin(), size()); - result.Resize(make_ddim(result_dims)); - result.mutable_data(platform::CPUPlace()); - - for (size_t idx = 0; idx < size(); idx++) { - result.Slice(idx, idx + 1) - .CopyFrom(Read(idx), platform::CPUPlace(), - platform::CPUDeviceContext()); - } - return result; -} - -void TensorArray::Unstack(const LoDTensor& source) const { - Unstack(source, false /*data_shared*/); -} - -void TensorArray::UnstackShared(const LoDTensor& source) const { - Unstack(source, true /*data_shared*/); -} - -void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const { - size_t first_dim = source.dims()[0]; - DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size()); - PADDLE_ENFORCE_GT(first_dim, 0, - "source should have some data to be unstacked"); - - values_.resize(first_dim); - - for (size_t elem = 0; elem < first_dim; elem++) { - // create a new value - auto& value = values_[elem]; - if (data_shared) { - // share memory - value.ShareDataWith(source.Slice(elem, elem + 1)); - } else { - // copy - value.Resize(value_dims); - value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(), - platform::CPUDeviceContext()); - } - } -} - -size_t TensorArray::size() const { return values_.size(); } - -namespace detail { - -void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) { - PADDLE_ENFORCE(meta.empty(), "duplicate build meta"); - // collect meta for each sequence in some level - auto lod = SliceLevels(source->lod(), level, level + 1)[0]; - - for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) { - DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id}); - meta.push_back(seq_meta); - } - - PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty"); - - // sort by length - sort(meta.begin(), meta.end(), - [descend](const DySeqMeta& a, const DySeqMeta& b) { 
- bool a_ge_b = (a.end - a.begin) > (b.end - b.begin); - return descend ? a_ge_b : !a_ge_b; - }); -} - -LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { - PADDLE_ENFORCE(!meta.empty(), "should build meta first"); - LoDTensor result; - - auto indice = detail::GenDyBatchIndice(meta, index); - PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index); - - // copy the indice of records in LoDTensor - auto record_dims = slice_ddim(source->dims(), 1, source->dims().size()); - auto record_dims_vec = vectorize(record_dims); - record_dims_vec.insert(record_dims_vec.begin(), indice.size()); - result.Resize(make_ddim(record_dims_vec)); - result.mutable_data(platform::CPUPlace()); - - for (size_t i = 0; i < indice.size(); i++) { - auto index = indice[i]; - auto target = result.Slice(i, i + 1); - auto slice = source->Slice(index, index + 1); - - target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext()); - } - - return result; -} - -// TODO(supejom) to cache lod if reasonable -LoDTensor PackDynamicBatch(const std::vector& source, - const std::vector& meta, const LoD& lod, - size_t level) { - PADDLE_ENFORCE(!source.empty()); - PADDLE_ENFORCE(!meta.empty()); - PADDLE_ENFORCE(!lod.empty()); - - LoDTensor result; - - // init result space - auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size()); - auto record_dims_vec = vectorize(record_dims); - auto height = lod[level].back(); - record_dims_vec.insert(record_dims_vec.begin(), height); - result.Resize(make_ddim(record_dims_vec)); - result.mutable_data(platform::CPUPlace()); - - for (size_t batch_id = 0; batch_id < source.size(); batch_id++) { - for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) { - const auto& seq_meta = meta[seq_id]; - // source is source[batch_id][seq_id] - // target is result[index] - auto index = seq_meta.begin + batch_id; - if (index >= seq_meta.end) break; - auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); - auto target = result.Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace(), - platform::CPUDeviceContext()); - } - } - - result.set_lod(lod); - return result; -} - -} // namespace detail - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h deleted file mode 100644 index 78fad8cab7e27a7f07ca542c2a083460ee9e2b79..0000000000000000000000000000000000000000 --- a/paddle/framework/tensor_array.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include - -#include "paddle/framework/lod_tensor.h" - -namespace paddle { -namespace framework { - -/* - * DyBatchSeqPosition stores indices of the basic element in tensor. It is used - * after lod-tensor's re-assembling, its info can be used to recover the order - * in original lod-tensor. 
- */ -struct DySeqMeta { - DySeqMeta(size_t begin, size_t end, size_t ori_idx) - : begin(begin), end(end), ori_idx(ori_idx) {} - - size_t begin; - size_t end; // not included - size_t ori_idx; -}; - -using DySeqMetaBatch = std::vector; - -/* - * Extract the indices of instances. - */ -std::vector GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id); - -/* - * TensorArray is a C-array-like array of tensors, it is meant to be used with - * dynamic iteration primitives such as while_loop. It is used to segment inputs - * and store states in all time steps. - * - * By providing some methods similar to a C++ array, the difinition of some - * state-based dynamic models such as RNN cound be more natural and highly - * flexible. - */ -class TensorArray { - public: - using value_type = float; - - // max number of values allowed to store. - const size_t MAX_SIZE{100000}; - - /* - * Read the value at location `index` in the `TensorArray`. - */ - const LoDTensor &Read(size_t index) const; - - /* - * Write value into the index of the TensorArray. - */ - void Write(size_t index, const LoDTensor &value); - - /* - * Write value into the index of the TensorArray, with memory shared. - */ - void WriteShared(size_t index, const LoDTensor &value); - - /* - * Recover the original LoD-arranged LoDTensor with the `values`, `level` and - * `indice_map`. - */ - LoDTensor Pack(size_t level, const DySeqMetaBatch &meta, - const LoD &lod) const; - - /* - * Split LoDTensor in some `level` and write the generated batches to - * `values`, if set `desend`, will sort by length in descending order else in - * ascending order. - */ - DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend); - - /* - * Pack an array of LoDTensors to a LoDTensor. - */ - LoDTensor LodPack(size_t level) const; - - /* - * Unpack a LoDTensor to an array of LoDTensors. - */ - void LodUnpack(const LoDTensor &source, size_t level); - - /* - * Pack the values into a tensor with rank one higher than each tensor in - * values. - */ - LoDTensor Stack() const; - - /* - * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors. - */ - void Unstack(const LoDTensor &source) const; - - /* - * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors, - * with memory of tensors shared. - */ - void UnstackShared(const LoDTensor &source) const; - - /* - * Return the number of values. - */ - size_t size() const; - - protected: - void Unstack(const LoDTensor &source, bool data_shared) const; - - LoDTensor LodPackTwo(const LoDTensor &pre, const LoDTensor &cur, - size_t level) const; - - private: - mutable std::vector values_; -}; // class TensorArray - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc deleted file mode 100644 index 83b52b442daf9b2f1fc40f23e458fcb67c5040e8..0000000000000000000000000000000000000000 --- a/paddle/framework/tensor_array_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/tensor_array.h" - -#include - -namespace paddle { -namespace framework { - -class TensorArrayTester : public ::testing::Test { - protected: - void SetUp() override { - LoDTensor source; - source.Resize(make_ddim({batch_size, dim})); - int* data = source.mutable_data(platform::CPUPlace()); - for (int i = 0; i < 16 * 32; i++) { - data[i] = i; - } - ta.Unstack(source); - } - - TensorArray ta; - const int batch_size = 16; - const int dim = 32; -}; - -TEST_F(TensorArrayTester, Read) { - for (int i = 0; i < batch_size; i++) { - const auto& tensor = ta.Read(i); - ASSERT_EQ(tensor.dims()[0], 1); - ASSERT_EQ(tensor.dims()[1], dim); - } -} - -TEST_F(TensorArrayTester, Write) { - LoDTensor source; - source.Resize(make_ddim({1, dim})); - for (int i = 0; i < dim; i++) { - *(source.mutable_data(platform::CPUPlace()) + i) = i; - } - - ta.Write(2, source); - - const auto& tensor = ta.Read(2); - for (int i = 0; i < dim; i++) { - EXPECT_EQ(*(tensor.data() + i), *(source.data() + i)); - } -} - -TEST_F(TensorArrayTester, WriteShared) { - LoDTensor source; - source.Resize(make_ddim({1, dim})); - for (int i = 0; i < dim; i++) { - *(source.mutable_data(platform::CPUPlace()) + i) = i; - } - - ta.WriteShared(2, source); - - const auto& tensor = ta.Read(2); - for (int i = 0; i < dim; i++) { - EXPECT_EQ(*(tensor.data() + i), *(source.data() + i)); - } - - EXPECT_EQ(source.data(), tensor.data()); -} - -class TensorArrayPackTester : public ::testing::Test { - protected: - virtual void SetUp() override { - lod.push_back(std::vector{0, 2, 9, 13}); - - source.set_lod(lod); - source.Resize(make_ddim({13, 128})); - source.mutable_data(platform::CPUPlace()); - - // content of each setence: 0 1 2 3 4 - const auto& level = lod.front(); - for (size_t i = 0; i < level.size() - 1; i++) { - size_t begin = level[i]; - size_t end = level[i + 1]; - for (size_t j = begin; j < end; j++) { - auto record = source.Slice(j, j + 1); - for (int dim = 0; dim < 128; dim++) { - record.mutable_data(platform::CPUPlace())[dim] = j - begin; - } - } - } - - // unpack - meta = ta.Unpack(source, 0, true); - } - - LoD lod; - TensorArray ta; - LoDTensor source; - std::vector meta; -}; - -TEST_F(TensorArrayPackTester, Unpack) { - ASSERT_EQ(ta.size(), 7UL); - - const auto& t0 = ta.Read(0); - const auto& t1 = ta.Read(1); - - ASSERT_EQ(t0.data()[0], int(0)); - ASSERT_EQ(t1.data()[0], int(1)); -} - -TEST_F(TensorArrayPackTester, Pack) { - LoDTensor packed = ta.Pack(0, meta, lod); -} - -TEST_F(TensorArrayTester, size) { - ASSERT_EQ(ta.size(), static_cast(batch_size)); -} - -TEST(TensorArray, LodPack) { - // three time steps, each step stores a LoDTensors - // - [0] [1] - // - [2 3], [4 5] - // - [6 7] [] [8], [9, 10] - // try to get a LoDTensor with content: - // - [0 2 6] - // - [0 2 7] - // - [0 3] - // - [1 4 8] - // - [1 5 9] - // - [1 5 10] - std::array tensors; - tensors[0].Resize(make_ddim({2, 1})); - tensors[1].Resize(make_ddim({4, 1})); - tensors[2].Resize(make_ddim({5, 1})); - int index = 0; - for (auto& t : tensors) { - t.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t.dims()[0]; i++) { - t.data()[i] = index; - index++; - } - } - - std::array lods; - std::vector> levels{ - {0, 1, 2}, {0, 2, 4}, {0, 2, 2, 3, 5}}; - for (int i = 0; i < 3; i++) { - lods[i].emplace_back(levels[i].begin(), levels[i].end()); - } - - TensorArray ta; - for (int i = 0; i < 3; i++) { - tensors[i].set_lod(lods[i]); - ta.Write(i, 
tensors[i]); - } - - auto merged = ta.LodPack(0); - - std::vector target_tensor_data{{0, 2, 6, // 0 - 0, 2, 7, // 1 - 0, 3, // 2 - 1, 4, 8, // 3 - 1, 5, 9, // 5 - 1, 5, 10}}; - EXPECT_EQ(merged.dims()[0], (int)target_tensor_data.size()); - for (size_t i = 0; i < target_tensor_data.size(); i++) { - EXPECT_EQ(target_tensor_data[i], merged.data()[i]); - } -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 7e88e039611007d17156d10f852eb46f3ee8e7a3..1340c5e48520ccdd537e694abf452fd79129df99 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -84,6 +84,8 @@ inline const T* Tensor::data() const { reinterpret_cast(holder_->ptr()) + offset_); } +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + template inline T* Tensor::data() { check_memory_size(); @@ -125,11 +127,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { boost::get(place), size, type)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } #endif offset_ = 0; @@ -150,84 +152,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) { return *this; } -inline void Tensor::CopyFrom(const Tensor& src, - const platform::Place& dst_place, - const platform::DeviceContext& ctx) { - src.check_memory_size(); - Resize(src.dims()); - - auto src_place = src.holder_->place(); - auto src_ptr = src.data(); - - auto dst_ptr = mutable_data(dst_place, src.type()); - - auto size = src.numel() * SizeOfType(src.type()); - - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size); - } -#ifdef PADDLE_WITH_CUDA - else if (platform::is_gpu_place(src_place) && - platform::is_cpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_cpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { - auto src_cpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } -#endif -} - -template -inline void Tensor::CopyFromVector(const 
std::vector& src, - const platform::DeviceContext& ctx) { - auto dst_place = ctx.GetPlace(); - auto src_ptr = static_cast(src.data()); - platform::CPUPlace src_place; - auto dst_ptr = static_cast(mutable_data(dst_place)); - auto size = src.size() * sizeof(T); - - if (platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, src_place, - src_ptr, size); - } -#ifdef PADDLE_WITH_CUDA - else if (platform::is_gpu_place(dst_place)) { - memory::Copy( - boost::get(dst_place), dst_ptr, src_place, src_ptr, - size, - reinterpret_cast(ctx).stream()); - } -#endif -} - inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, @@ -243,6 +167,7 @@ inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { size_t base = numel() / dims_[0]; Tensor dst; dst.holder_ = holder_; + dst.set_layout(layout_); DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 1bb0fb71b079940d35a995b78e04a531c074a8b2..c04cd38697f85b1d51aaa42b34a189f22bb4d5cc 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,12 +28,13 @@ #include #include +namespace framework = paddle::framework; +namespace platform = paddle::platform; + TEST(Tensor, Dims) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor tt; + framework::Tensor tt; tt.Resize({2, 3, 4}); - DDim dims = tt.dims(); + framework::DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { EXPECT_EQ(i + 2, dims[i]); @@ -28,12 +42,12 @@ TEST(Tensor, Dims) { } TEST(Tensor, DataAssert) { - paddle::framework::Tensor src_tensor; + framework::Tensor src_tensor; bool caught = false; try { src_tensor.data(); - } catch (paddle::platform::EnforceNotMet err) { + } catch (platform::EnforceNotMet err) { caught = true; std::string msg = "holder_ should not be null\nTensor holds no memory. Call " @@ -50,61 +64,65 @@ TEST(Tensor, DataAssert) { because Memory::Alloc() and Memory::Free() have not been ready. 
*/ TEST(Tensor, MutableData) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src_tensor; + framework::Tensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), CPUPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CPUPlace()); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), CPUPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CPUPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), CPUPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CPUPlace()); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); EXPECT_EQ(p1, p2); } #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; + framework::Tensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CUDAPlace()); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CUDAPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CUDAPlace()); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CUDAPlace()); EXPECT_EQ(p1, p2); } #endif } TEST(Tensor, ShareDataWith) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src_tensor; - Tensor dst_tensor; + framework::Tensor src_tensor; + framework::Tensor dst_tensor; // Try to share data form uninitialized tensor bool caught = false; try { @@ -121,16 +139,18 @@ TEST(Tensor, ShareDataWith) { } ASSERT_TRUE(caught); - src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CPUPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; - Tensor dst_tensor; - src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CUDAPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -138,13 +158,12 @@ TEST(Tensor, ShareDataWith) { } TEST(Tensor, Slice) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = 
src_tensor.Slice(1, 3); - DDim slice_dims = slice_tensor.dims(); + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); EXPECT_EQ(slice_dims[1], 3); @@ -153,11 +172,12 @@ TEST(Tensor, Slice) { uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), CPUPlace())); + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), CPUPlace())); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); @@ -165,22 +185,25 @@ TEST(Tensor, Slice) { #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); - DDim slice_dims = slice_tensor.dims(); + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); EXPECT_EQ(slice_dims[1], 9); uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); @@ -188,187 +211,20 @@ TEST(Tensor, Slice) { #endif } -TEST(Tensor, CopyFrom) { - using namespace paddle::framework; - using namespace paddle::platform; - { - Tensor src_tensor; - Tensor dst_tensor; - CPUDeviceContext cpu_ctx((CPUPlace())); - - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - - auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); - - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); - } - } -#ifdef 
PADDLE_WITH_CUDA - { - Tensor src_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; - - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - - // CPU Tensor to GPU Tensor - auto gpu_place = new paddle::platform::GPUPlace(0); - CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); - - // GPU Tensor to CPU Tensor - auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - Tensor slice_tensor = src_tensor.Slice(1, 2); - - // CPU Slice Tensor to GPU Tensor - gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); - - // GPU Tensor to CPU Tensor - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Slice Tensors - gpu_ctx.Wait(); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); - } - } -#endif -} - -TEST(Tensor, CopyFromVector) { - using namespace paddle::framework; - using namespace paddle::platform; - { - std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - - // Copy to CPU Tensor - cpu_tensor.Resize(make_ddim({3, 3})); - auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - - // Compare Tensors - const int* cpu_ptr = cpu_tensor.data(); - const int* src_ptr = src_vec.data(); - ASSERT_NE(src_ptr, cpu_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - } - - src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - cpu_ptr = cpu_tensor.data(); - src_ptr = src_vec.data(); - ASSERT_NE(src_ptr, cpu_ptr); - for (size_t i = 0; i < 5; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - } - - delete cpu_place; - } - -#ifdef PADDLE_WITH_CUDA - { - std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; - - // Copy to CPU Tensor - cpu_tensor.Resize(make_ddim({3, 3})); - auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - - // Copy to GPUTensor - gpu_tensor.Resize(make_ddim({3, 3})); - auto gpu_place = new paddle::platform::GPUPlace(); - CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - // Copy from GPU to CPU tensor for comparison - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - const int* src_ptr = src_vec.data(); - const int* cpu_ptr = cpu_tensor.data(); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, cpu_ptr); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - - cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - gpu_tensor.Resize(make_ddim({2, 2})); - gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - 
gpu_ctx.Wait(); - src_ptr = src_vec.data(); - cpu_ptr = cpu_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, cpu_ptr); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 5; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - delete cpu_place; - delete gpu_place; - } -#endif -} - TEST(Tensor, ReshapeToMatrix) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor src; - int* src_ptr = src.mutable_data({2, 3, 4, 9}, CPUPlace()); + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { src_ptr[i] = i; } - Tensor res = ReshapeToMatrix(src, 2); + framework::Tensor res = framework::ReshapeToMatrix(src, 2); ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } + +TEST(Tensor, Layout) { + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); +} diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5b83eaa07ad25d39996f5644d6a7f3ed35ff7b2 --- /dev/null +++ b/paddle/framework/tensor_util.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. 
+ o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_util.cu b/paddle/framework/tensor_util.cu new file mode 120000 index 0000000000000000000000000000000000000000..b00e6e59d93328bf3142597ea4de0dc225501e56 --- /dev/null +++ b/paddle/framework/tensor_util.cu @@ -0,0 +1 @@ +./tensor_util.cc \ No newline at end of file diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h new file mode 100644 index 0000000000000000000000000000000000000000..b49c61449984f51d65963958c87191b0799bcf5b --- /dev/null +++ b/paddle/framework/tensor_util.h @@ -0,0 +1,333 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/data_type.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +/** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. + */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to " + << dst_place; + src.check_memory_size(); + + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief Wrapper on + * Copy(const Tensor& src, const platform::Place& dst_place, + * const platform::DeviceContext& ctx, Tensor* dst); + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. + */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(src.place())) { + dev_ctx = pool.Get(src.place()); + } else { + dev_ctx = pool.Get(dst_place); + } + Copy(src, dst_place, *dev_ctx, dst); +} + +/** + * @brief Copy the content of an external vector to a tensor. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. 
+ *
+ * @note CopyFromVector will resize dst to a 1-D tensor with the same
+ * number of elements as src.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src,
+                           const platform::DeviceContext& ctx, Tensor* dst) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
+    memory::Copy(
+        boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place,
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief CopyFromVector: CPU vector -> CPU Tensor
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
+  platform::CPUPlace dst_place = platform::CPUPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+}
+
+/**
+ * @brief Copy the content of a tensor to a vector.
+ *
+ * @param[in] src The external tensor.
+ * @param[in] ctx The device context that holds the device resources.
+ *
+ * @note CopyToVector resizes dst to src.numel() before copying, so the
+ * caller does not need to size it in advance.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                         std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  if (platform::is_cpu_place(src.place())) {
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src.place())) {  // NOLINT
+    memory::Copy(
+        dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief CopyToVector: CPU Tensor -> CPU vector
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
+
+  memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
+               src_ptr, size);
+}
+
+// Returns true if a tensor contains NAN, i.e., Not A Number.
+bool HasNAN(const framework::Tensor& tensor);
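
A minimal usage sketch for the copy helpers declared above, assuming a CPU-only build and an initialized DeviceContextPool at runtime; the values and the function name (CopyRoundTripExample) are illustrative and not part of the patch:

// Round-trip a std::vector through a Tensor and check it for NaN.
#include <vector>
#include "paddle/framework/tensor_util.h"

void CopyRoundTripExample() {
  namespace fw = paddle::framework;
  std::vector<float> src = {1.f, 2.f, 3.f};

  fw::Tensor t;
  fw::CopyFromVector(src, &t);  // resizes t to a 1-D tensor of 3 elements

  std::vector<float> dst;
  fw::CopyToVector(t, &dst);  // resizes dst to t.numel()

  // HasNAN reduces an elementwise isnan() over the tensor; false here.
  bool has_nan = fw::HasNAN(t);
  (void)has_nan;
}

+
+// Returns true if a tensor contains Inf, i.e., Infinity.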
+bool HasInf(const framework::Tensor& tensor); + +inline void SerializeToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void operator()() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + void* buf; + auto ctx = platform::CPUDeviceContext(); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), cpu_tensor.memory_size()); + auto dst_place = dev_ctx.GetPlace(); + framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor); +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + 
is.read(static_cast(buf), tensor->memory_size()); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f541927c0e7e044adeabee01e8ec91e7d8ef7baf --- /dev/null +++ b/paddle/framework/tensor_util_test.cc @@ -0,0 +1,321 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/framework/tensor_util.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(Copy, Tensor) { + Tensor src_tensor; + Tensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); + + auto cpu_place = new platform::CPUPlace(); + Copy(src_tensor, *cpu_place, &dst_tensor); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + + Tensor slice_tensor = src_tensor.Slice(1, 2); + Copy(slice_tensor, *cpu_place, &dst_tensor); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + +#ifdef PADDLE_WITH_CUDA + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new platform::CUDAPlace(0); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + auto cpu_place = new platform::CPUPlace(); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = 
src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + } +#endif +} + +TEST(CopyFromVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CopyFromVector(src_vec, &cpu_tensor); + + // Compare Tensors + const int* cpu_ptr = cpu_tensor.data(); + const int* src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, &cpu_tensor); + cpu_ptr = cpu_tensor.data(); + src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + delete cpu_place; + } + +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CPUDeviceContext cpu_ctx(*cpu_place); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Copy to GPUTensor + gpu_tensor.Resize(make_ddim({3, 3})); + auto gpu_place = new paddle::platform::CUDAPlace(); + CUDADeviceContext gpu_ctx(*gpu_place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + // Copy from GPU to CPU tensor for comparison + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* src_ptr = src_vec.data(); + const int* cpu_ptr = cpu_tensor.data(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + gpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + src_ptr = src_vec.data(); + cpu_ptr = cpu_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + delete cpu_place; + delete gpu_place; + } +#endif +} + +TEST(CopyToVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src; + int* src_ptr = src.mutable_data({3, 3}, CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = i; + } + + CPUPlace place; + std::vector dst; + CopyToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 
9}; + Tensor gpu_tensor; + CUDAPlace place; + CUDADeviceContext gpu_ctx(place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + CopyToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + +TEST(HasNAN, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + + ASSERT_TRUE(HasNAN(src)); +} + +TEST(HasInf, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(HasInf(src)); +} + +TEST(Tensor, SerializeAndDeserialize) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + SerializeToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + ASSERT_EQ(dst_tensor.dims(), src_tensor.dims()); + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + SerializeToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, gpu_ctx); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + delete gpu_place; + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_util_test.cu b/paddle/framework/tensor_util_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..ebd35fdf6c2a1388fec23057070f723c8ef9da9c --- /dev/null +++ b/paddle/framework/tensor_util_test.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#include "gtest/gtest.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+static __global__ void FillNAN(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = NAN;
+}
+static __global__ void FillInf(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.5;
+}
+
+TEST(HasNAN, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasNAN(tensor));
+}
+
+TEST(HasInf, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasInf(tensor));
+}
+
+}  // namespace framework
+}  // namespace paddle
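
For reference, a hedged round-trip sketch of the SerializeToStream / DeserializeFromStream helpers from tensor_util.h above; the function name and values are illustrative only:

#include <sstream>
#include "paddle/framework/tensor_util.h"

void SerializeRoundTripExample() {
  namespace fw = paddle::framework;
  namespace plat = paddle::platform;

  fw::Tensor src, dst;
  int* p = src.mutable_data<int>(fw::make_ddim({2, 3}), plat::CPUPlace());
  for (int i = 0; i < 6; ++i) p[i] = i;

  plat::CPUDeviceContext ctx((plat::CPUPlace()));
  std::ostringstream os;
  fw::SerializeToStream(os, src, ctx);  // writes version, TensorDesc, raw data

  std::istringstream is(os.str());
  fw::DeserializeFromStream(is, &dst, ctx);  // dst now matches src
}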
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..109a7e7dc440d91e8223f2c0924f489f54a06f64
--- /dev/null
+++ b/paddle/framework/threadpool.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/threadpool.h"
+
+namespace paddle {
+namespace framework {
+
+std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
+std::once_flag ThreadPool::init_flag;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ac345851c38557f82698786dd3bc8e1202a4256
--- /dev/null
+++ b/paddle/framework/threadpool.h
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+class ThreadPool {
+ public:
+  typedef std::packaged_task<void()> Task;
+
+  /**
+   * @brief Get an instance of the thread pool; the number of threads
+   * equals the number of hardware thread contexts.
+   */
+  static ThreadPool* GetInstance() {
+    std::call_once(init_flag, &ThreadPool::Init);
+    return threadpool.get();
+  }
+
+  ~ThreadPool() {
+    {
+      // notify all threads to stop running
+      running_ = false;
+      scheduled_.notify_all();
+    }
+
+    for (auto& t : threads_) {
+      t->join();
+      t.reset(nullptr);
+    }
+  }
+
+  int GetNumThreads() const { return num_threads_; }
+
+  int GetAvailable() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return available_;
+  }
+
+  /**
+   * @brief Push a function to the queue; it is scheduled and executed
+   * when a thread becomes available.
+   * @param[in] fn The task, which is pushed to the task queue.
+   * @return std::future<void>; the caller can wait for the task to
+   * finish via f.wait().
+   */
+  template <typename Callback>
+  std::future<void> Run(Callback fn) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    Task task(std::bind(fn));
+    std::future<void> f = task.get_future();
+    tasks_.push(std::move(task));
+    lock.unlock();
+    scheduled_.notify_one();
+    return f;
+  }
+
+  /**
+   * @brief Wait until all the tasks are completed.
+   */
+  void Wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    completed_.wait(lock, [=] { return Done() == true; });
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(ThreadPool);
+
+  explicit ThreadPool(int num_threads)
+      : num_threads_(num_threads), available_(num_threads), running_(true) {
+    threads_.resize(num_threads);
+    for (auto& thread : threads_) {
+      // TODO(Yancey1989): bind each thread to a specific CPU core
+      thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+    }
+  }
+
+  /**
+   * @brief If the task queue is empty and available_ equals the number
+   * of threads, all tasks are completed.
+   *
+   * Note: this function is not thread-safe.
+   *
+   * @return true if all tasks are completed.
+   */
+  bool Done() { return tasks_.empty() && available_ == num_threads_; }
+
+  void TaskLoop() {
+    while (running_) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+      if (!running_) {
+        break;
+      }
+      // pop a task from the task queue
+      auto task = std::move(tasks_.front());
+      tasks_.pop();
+
+      --available_;
+      lock.unlock();
+
+      // run the task
+      task();
+
+      {
+        std::unique_lock<std::mutex> lock(mutex_);
+        ++available_;
+        if (Done()) {
+          completed_.notify_all();
+        }
+      }
+    }
+  }
+
+  static void Init() {
+    if (threadpool.get() == nullptr) {
+      // TODO(Yancey1989): make the max number of threads configurable
+      int num_threads = std::thread::hardware_concurrency();
+      PADDLE_ENFORCE_GT(num_threads, 0);
+      threadpool.reset(new ThreadPool(num_threads));
+    }
+  }
+
+ private:
+  static std::unique_ptr<ThreadPool> threadpool;
+  static std::once_flag init_flag;
+
+  int num_threads_;
+  int available_;
+  bool running_;
+  std::queue<Task> tasks_;
+  std::vector<std::unique_ptr<std::thread>> threads_;
+  std::mutex mutex_;
+  std::condition_variable scheduled_;
+  std::condition_variable completed_;
+};
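
A small usage sketch for the singleton pool above; the task count, counter, and function name are illustrative:

#include <atomic>
#include <future>
#include <vector>
#include "paddle/framework/threadpool.h"

void ThreadPoolExample() {
  namespace fw = paddle::framework;
  fw::ThreadPool* pool = fw::ThreadPool::GetInstance();

  std::atomic<int> done(0);
  std::vector<std::future<void>> futures;
  for (int i = 0; i < 8; ++i) {
    futures.push_back(pool->Run([&done] { done.fetch_add(1); }));
  }
  for (auto& f : futures) f.wait();  // or pool->Wait() to drain the queue
}

+
+// Run a function asynchronously.
+// NOTE: The function must return void. If the function needs to return a
+// value, you can use a lambda to capture a value pointer.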
+template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..50b6238cd8786be9d8cf2d5f821daadea12bd208 --- /dev/null +++ b/paddle/framework/threadpool_test.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "threadpool.h" + +namespace framework = paddle::framework; + +void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { + std::vector> fs; + for (int i = 0; i < cnt; ++i) { + auto f = pool->Run([&sum]() { sum.fetch_add(1); }); + fs.push_back(std::move(f)); + } + for (auto& f : fs) { + f.wait(); + } +} + +TEST(ThreadPool, ConcurrentInit) { + framework::ThreadPool* pool; + int n = 50; + std::vector threads; + for (int i = 0; i < n; ++i) { + std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } +} + +TEST(ThreadPool, ConcurrentRun) { + framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); + std::atomic sum(0); + std::vector threads; + int n = 50; + // sum = (n * (n + 1)) / 2 + for (int i = 1; i <= n; ++i) { + std::thread t(do_sum, pool, std::ref(sum), i); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } + pool->Wait(); + EXPECT_EQ(sum, ((n + 1) * n) / 2); +} diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h index baeb98c9bd49ec65da5931bcbe33ab788f86f3e8..d834d343759fa279a1444c6337956ffce1b9061a 100644 --- a/paddle/framework/type_defs.h +++ b/paddle/framework/type_defs.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include @@ -25,11 +25,9 @@ namespace paddle { namespace framework { class OperatorBase; -class OpDescBind; -class BlockDescBind; -class BlockDesc; +class OpDesc; class InferShapeContext; -class BlockDescBind; +class BlockDesc; using VariableNameMap = std::map>; @@ -37,7 +35,7 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDescBind*>; + std::vector, BlockDesc*>; using AttributeMap = std::unordered_map; @@ -45,13 +43,13 @@ using OpCreator = std::function; -using GradOpMakerFN = std::function>( - const OpDescBind&, const std::unordered_set& /*no_grad_set*/, +using GradOpMakerFN = std::function>( + const OpDesc&, const std::unordered_set& /*no_grad_set*/, std::unordered_map* /*grad_to_var*/, - const std::vector& grad_block)>; + const std::vector& grad_block)>; -using InferVarTypeFN = std::function; +using InferVarTypeFN = + std::function; using InferShapeFN = std::function; diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index 0babec29f6f4412ed29deeafe24470e86b30a636..62ab6593ef23c195e3caa2336574796ecaf35bc8 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -18,30 +18,32 @@ limitations under the License. */ namespace paddle { namespace framework { -VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); } +proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); } -void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); } +void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); } -void VarDescBind::SetShape(const std::vector &dims) { +void VarDesc::SetShape(const std::vector &dims) { VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); } -void VarDescBind::SetDataType(DataType data_type) { +void VarDesc::SetDataType(proto::DataType data_type) { mutable_tensor_desc()->set_data_type(data_type); } -std::vector VarDescBind::Shape() const { +std::vector VarDesc::Shape() const { return RepeatedToVector(tensor_desc().dims()); } -DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); } +proto::DataType VarDesc::GetDataType() const { + return tensor_desc().data_type(); +} -void VarDescBind::SetLoDLevel(int32_t lod_level) { +void VarDesc::SetLoDLevel(int32_t lod_level) { switch (desc_.type()) { - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: desc_.mutable_lod_tensor()->set_lod_level(lod_level); break; - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: desc_.mutable_tensor_array()->set_lod_level(lod_level); break; default: @@ -50,11 +52,11 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) { } } -int32_t VarDescBind::GetLodLevel() const { +int32_t VarDesc::GetLoDLevel() const { switch (desc_.type()) { - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: return desc_.lod_tensor().lod_level(); - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.tensor_array().lod_level(); default: PADDLE_THROW("Tensor type=%d does not support LoDLevel", @@ -62,29 +64,29 @@ int32_t VarDescBind::GetLodLevel() const { } } -const TensorDesc &VarDescBind::tensor_desc() const { +const proto::TensorDesc &VarDesc::tensor_desc() const { PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type"); switch (desc_.type()) { - case VarDesc::SELECTED_ROWS: + case proto::VarDesc::SELECTED_ROWS: return desc_.selected_rows(); - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: return 
desc_.lod_tensor().tensor(); - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.tensor_array().tensor(); default: - PADDLE_THROW("Unexpected branch."); + PADDLE_THROW("The type of var %s is unsupported.", this->Name()); } } -TensorDesc *VarDescBind::mutable_tensor_desc() { +proto::TensorDesc *VarDesc::mutable_tensor_desc() { PADDLE_ENFORCE(desc_.has_type(), "invoke MutableTensorDesc must after set type"); switch (desc_.type()) { - case VarDesc::SELECTED_ROWS: + case proto::VarDesc::SELECTED_ROWS: return desc_.mutable_selected_rows(); - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: return desc_.mutable_lod_tensor()->mutable_tensor(); - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.mutable_tensor_array()->mutable_tensor(); default: PADDLE_THROW("Unexpected branch."); diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index 5cf4608944c5011d798fbde060002a57be8f6102..fc482c467404a6b9dfed64c43871d91d3d10c766 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -53,44 +53,44 @@ inline void VectorToRepeated(const std::vector &vec, } } -class VarDescBind { +class VarDesc { public: - explicit VarDescBind(const std::string &name) { + explicit VarDesc(const std::string &name) { desc_.set_name(name); - desc_.set_type(VarDesc::LOD_TENSOR); + desc_.set_type(proto::VarDesc::LOD_TENSOR); } - explicit VarDescBind(const VarDesc &desc) : desc_(desc) {} + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} - VarDesc *Proto() { return &desc_; } + proto::VarDesc *Proto() { return &desc_; } std::string Name() const { return desc_.name(); } void SetShape(const std::vector &dims); - void SetDataType(DataType data_type); + void SetDataType(proto::DataType data_type); std::vector Shape() const; - DataType GetDataType() const; + proto::DataType GetDataType() const; void SetLoDLevel(int32_t lod_level); - int32_t GetLodLevel() const; + int32_t GetLoDLevel() const; - VarDesc::VarType GetType() const; + proto::VarDesc::VarType GetType() const; - void SetType(VarDesc::VarType type); + void SetType(proto::VarDesc::VarType type); bool Persistable() const { return desc_.persistable(); } void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } private: - const TensorDesc &tensor_desc() const; - TensorDesc *mutable_tensor_desc(); + const proto::TensorDesc &tensor_desc() const; + proto::TensorDesc *mutable_tensor_desc(); - VarDesc desc_; + proto::VarDesc desc_; }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h index 0f19870bec3e69d07278507cc556a86bbd25d12d..5b7a08a08732a6ccbc206f6a4f0aa4788ce4a219 100644 --- a/paddle/framework/var_type.h +++ b/paddle/framework/var_type.h @@ -1,52 +1,54 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/variable.h" namespace paddle { namespace framework { -inline VarDesc::VarType ToVarType(std::type_index type) { +inline proto::VarDesc::VarType ToVarType(std::type_index type) { if (type.hash_code() == typeid(LoDTensor).hash_code()) { - return VarDesc_VarType_LOD_TENSOR; + return proto::VarDesc_VarType_LOD_TENSOR; } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { - return VarDesc_VarType_LOD_RANK_TABLE; + return proto::VarDesc_VarType_LOD_RANK_TABLE; } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { - return VarDesc_VarType_LOD_TENSOR_ARRAY; + return proto::VarDesc_VarType_LOD_TENSOR_ARRAY; } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { - return VarDesc_VarType_SELECTED_ROWS; + return proto::VarDesc_VarType_SELECTED_ROWS; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); } } template -inline void VisitVarType(const Variable& var, Visitor visitor) { +inline void VisitVarType(const framework::Variable& var, Visitor visitor) { switch (ToVarType(var.Type())) { - case VarDesc_VarType_LOD_TENSOR: + case proto::VarDesc_VarType_LOD_TENSOR: visitor(var.Get()); return; - case VarDesc_VarType_LOD_RANK_TABLE: + case proto::VarDesc_VarType_LOD_RANK_TABLE: visitor(var.Get()); return; - case VarDesc_VarType_LOD_TENSOR_ARRAY: + case proto::VarDesc_VarType_LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case VarDesc_VarType_SELECTED_ROWS: + case proto::VarDesc_VarType_SELECTED_ROWS: visitor(var.Get()); return; default: diff --git a/paddle/framework/var_type_inference.h b/paddle/framework/var_type_inference.h index 32abbeb33479444c5e7a9889f4211f59af07f98f..6c11f2fee7f554fb008f559bb33aeafea5c5a556 100644 --- a/paddle/framework/var_type_inference.h +++ b/paddle/framework/var_type_inference.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include "paddle/framework/type_defs.h" @@ -21,8 +21,7 @@ namespace framework { class VarTypeInference { public: virtual ~VarTypeInference() {} - virtual void operator()(const OpDescBind& op_desc, - BlockDescBind* block) const = 0; + virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; }; } // namespace framework diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/framework/var_type_inference_test.cc index 9035e63fa48ffdf7c72061b0a4248538d7a357e4..fa6018b1c583abaccb0259b82c9bb61c0fc10820 100644 --- a/paddle/framework/var_type_inference_test.cc +++ b/paddle/framework/var_type_inference_test.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/var_type_inference.h" #include "gtest/gtest.h" @@ -33,17 +33,16 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(const OpDescBind &op_desc, - BlockDescBind *block) const override { + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { auto &inputs = op_desc.Input("X"); - auto default_var_type = VarDesc::SELECTED_ROWS; + auto default_var_type = proto::VarDesc::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string &name) { - return block->Var(name)->GetType() == VarDesc::LOD_TENSOR; + return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR; }); if (any_input_is_lod_tensor) { - default_var_type = VarDesc::LOD_TENSOR; + default_var_type = proto::VarDesc::LOD_TENSOR; } auto out_var_name = op_desc.Output("Out").front(); @@ -62,43 +61,43 @@ namespace paddle { namespace framework { TEST(InferVarType, sum_op) { - ProgramDescBind prog; + ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); op->SetType("sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); - prog.MutableBlock(0)->Var("test_a")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test_c")->SetType(VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS); prog.MutableBlock(0)->Var("test_out"); op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(VarDesc::SELECTED_ROWS, + ASSERT_EQ(proto::VarDesc::SELECTED_ROWS, prog.MutableBlock(0)->Var("test_out")->GetType()); - prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::LOD_TENSOR); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR); op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(VarDesc::LOD_TENSOR, + ASSERT_EQ(proto::VarDesc::LOD_TENSOR, prog.MutableBlock(0)->Var("test_out")->GetType()); } TEST(InferVarType, sum_op_without_infer_var_type) { - ProgramDescBind prog; + ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); op->SetType("sum_without_infer_var_type"); op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); op->SetOutput("Out", {"test2_out"}); - prog.MutableBlock(0)->Var("test2_a")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test2_b")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test2_c")->SetType(VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS); prog.MutableBlock(0)->Var("test2_out"); op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(VarDesc_VarType_LOD_TENSOR, + ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index e5a94759f9230ab4ce9d2cc24849a2debb8a5e2f..03992c8608693259224901b2f9c89d458f126d09 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); @@ -32,6 +45,8 @@ class Variable { return *static_cast(holder_->Ptr()); } + bool IsInitialized() const { return holder_ != nullptr; } + template T* GetMutable() { if (!IsType()) { diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp index b3e666e860d29d89650d48a23cf44917035a02d7..644098a9e7873fb59b6343e805163e4892f060a8 100644 --- a/paddle/function/EigenGemm.cpp +++ b/paddle/function/EigenGemm.cpp @@ -21,7 +21,7 @@ template struct EigenBlasGemm { typedef Eigen::TensorMap, Eigen::Aligned> - Matrix; + EigenMatrix; static void compute(const bool transA, const bool transB, @@ -56,14 +56,13 @@ struct EigenBlasGemm { sizeB[1] = N; CHECK_EQ(N, ldb); } - Eigen::array sizeC; - sizeC[0] = M; - sizeC[1] = N; - CHECK_EQ(N, ldc); + Eigen::array sizeC = {{M, ldc}}; + Eigen::array offsetC = {{0, 0}}; + Eigen::array extentC = {{M, N}}; - const Matrix a(const_cast(A), sizeA); - const Matrix b(const_cast(B), sizeB); - Matrix c(C, sizeC); + const EigenMatrix a(const_cast(A), sizeA); + const EigenMatrix b(const_cast(B), sizeB); + EigenMatrix c(C, sizeC); typedef typename Eigen::Tensor::DimensionPair DimPair; Eigen::array dims; @@ -72,12 +71,23 @@ struct EigenBlasGemm { dims[0].second = transB ? 1 : 0; Eigen::DefaultDevice device; - if (alpha == T(1) && beta == T(0)) { - c.device(device) = a.contract(b, dims); - } else if (alpha == T(1) && beta == T(1)) { - c.device(device) += a.contract(b, dims); + if (N == ldc) { + if (alpha == T(1) && beta == T(0)) { + c.device(device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.device(device) += a.contract(b, dims); + } else { + c.device(device) = alpha * a.contract(b, dims) + beta * c; + } } else { - c.device(device) = alpha * a.contract(b, dims) + beta * c; + if (alpha == T(1) && beta == T(0)) { + c.slice(offsetC, extentC).device(device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.slice(offsetC, extentC).device(device) += a.contract(b, dims); + } else { + c.slice(offsetC, extentC).device(device) = + alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC); + } } } }; diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 8d34eee886a6202691e5dec2ab62e7c5b0ac7fb1..cbdbf5335d32d55a0221728758025c9d2cb3e7d1 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -129,6 +129,162 @@ public: } }; +#ifdef PADDLE_MOBILE_INFERENCE + +/* + * \brief Forward calculation of convolution, optimized for mobile. 
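+ *
+ * A memory-bounded variant: instead of materializing the whole im2col
+ * matrix at once, it expands and multiplies one stepColHeight x
+ * stepColWidth block (at most 256 x 2048 elements, see calc() below)
+ * at a time, feeding each block straight into BlasGemm.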
+ */
+template
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data();
+    real* filterData = inputs[1].data();
+    real* outputData = outputs[0].data();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // Max col matrix height 256, max col matrix width 2048
+    size_t stepColHeight = std::min(colHeight, static_cast(256));
+    size_t stepColWidth = std::min(colWidth, static_cast(2048));
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast(memory_->getBuf());
+    }
+
+    Im2ColMobileFunctor im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     dilationH(),
+                     dilationW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              int M = outputChannels / groups_;
+              BlasGemm::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          BlasGemm::compute(false,
+                            false,
+                            M,
+
N, + K, + 1.0f, + filterData + g * filterOffset, + K, + inputData + g * inputOffset, + N, + beta, + outputData + g * outputOffset, + N); + } + } + inputData += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + + memory_.reset(); + } +}; + +#endif + /* * \brief Backward input calculation of convolution. */ @@ -343,7 +499,11 @@ public: } }; +#ifdef PADDLE_MOBILE_INFERENCE +REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction); +#else REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +#endif REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 0c37fc972484bfbede01d23652e384071bf883af..36a9bcf84e4b14965c83627821b71d1c7c0da1b2 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -98,4 +98,54 @@ public: int dilationWidth = 1); }; +template +class Im2ColMobileFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int dilationHeight, + int dilationWidth, + int colHeightStart, + int colHeightSize, + int colWidthStart, + int colWidthSize) { + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputWidth = colShape[4]; + + for (int colh = 0; colh < colHeightSize; colh++) { + int wOffset = (colHeightStart + colh) % filterWidth; + int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; + int c_im = (colHeightStart + colh) / filterWidth / filterHeight; + + for (int colw = 0; colw < colWidthSize; colw++) { + int h = (colWidthStart + colw) / outputWidth; + int w = (colWidthStart + colw) % outputWidth; + + int imRowIdx = h * strideHeight + hOffset * dilationHeight; + int imColIdx = w * strideWidth + wOffset * dilationWidth; + if ((imRowIdx - paddingHeight) < 0 || + (imRowIdx - paddingHeight) >= inputHeight || + (imColIdx - paddingWidth) < 0 || + (imColIdx - paddingWidth) >= inputWidth) { + colData[colh * colWidthSize + colw] = static_cast(0); + } else { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + colData[colh * colWidthSize + colw] = + imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + } +}; + } // namespace paddle diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 1f085538d81904dbd5b5d6bcd014adaed22e37d7..3ba866dcdd845403d52f7a85adfef08cbb11c305 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } #endif +template +void TestIm2ColMobileFunctor() { + for (size_t channels : {32}) { + for (size_t inputHeight : {33, 100}) { + for (size_t inputWidth : {32, 96}) { + for (size_t filterHeight : {5}) { + for (size_t filterWidth : {7}) { + for (size_t stride : {2}) { + for (size_t padding : {1}) { + for (size_t dilation : {1, 3}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - 
filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(height, width, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColMobileFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation, + 0, + height, + 0, + width); + + autotest::TensorCheckEqual(*output1, *output2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } + } // namespace paddle diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 41ead3c5ecef248830cfb0f8be360f21dcd58e7b..3d6ced713f00bd72622d8aeed3967642b6774ffe 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -34,6 +34,16 @@ else() message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations") endif() +if(NOT WITH_MKLML) + file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h") + file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp") + list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER}) + list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES}) + message(STATUS "Skip compiling with MKLPackedLayers") +else() + message(STATUS "Compile with MKLPackedLayers") +endif() + if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER layers/CudnnConvBaseLayer.h diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index f5a41b66bf09a4abc5ae7b64f227ca52461408f5..57c890e4884da38e2087d89dc199e20af51495ea 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/utils/ClassRegistrar.h" #include "paddle/utils/Logging.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "MKLDNNActivation.h" #endif @@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) { END_DEFINE_ACTIVATION(log) ActivationFunction* ActivationFunction::create(const std::string& type) { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { return MKLDNNActivation::create(type); } diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h index ebfe0573cfdbfb2ef54a29b038e8b85356cc6c27..4ab54a5022a3d30215c7557bca2e69c4d011fc5f 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ b/paddle/gserver/gradientmachines/GradientMachine.h @@ -233,6 +233,13 @@ public: (void)numProcessed; } + /** + * @brief Release the middle layer's output memory. + * + * @note This function is used for memory optimization in inference. 
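+ *
+ * Only middle (neither input nor output) layers are released, so the
+ * network outputs stay valid. A typical inference call pattern might be
+ * (illustrative sketch only, with `machine` any GradientMachine):
+ *
+ *   machine->forward(inArgs, &outArgs, PASS_TEST);
+ *   // ... consume outArgs ...
+ *   machine->releaseOutput();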
+   */
+  virtual void releaseOutput() {}
+
 protected:
   virtual void onLoadParameter() {}
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index be112b41239cace3fa9b9ee97923f8c3c7a9a98f..1f2aa61b6f86eebf7b002f1e48aa56d1d14d4820 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 #include "paddle/gserver/layers/MKLDNNLayer.h"
 #endif
@@ -187,6 +187,31 @@ void NeuralNetwork::init(const ModelConfig& config,
     CHECK(it != layerMap_.end());
     outputLayers_.push_back(it->second);
   }
+
+  for (const auto& layer : layers_) {
+    const auto& name = layer->getName();
+    bool isMiddleLayer = true;
+
+    // if data layer
+    for (const auto& dataLayer : dataLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    // if output layer
+    for (const auto& outputLayer : outputLayers_) {
+      if (name == outputLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    if (isMiddleLayer) {
+      middleLayers_.push_back(layer);
+    }
+  }
 }
 void NeuralNetwork::connect(LayerPtr agentLayer,
@@ -307,7 +332,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
 }
 void NeuralNetwork::finish() {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
   FOR_EACH_R(layer, layers_) {
     MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer);
     if (dnnLayer) {
@@ -327,6 +352,13 @@ void NeuralNetwork::onPassEnd() {
   }
 }
+void NeuralNetwork::releaseOutput() {
+  for (auto& layer : middleLayers_) {
+    Argument& arg = layer->getOutput();
+    arg.value.reset();
+  }
+}
+
 #ifndef PADDLE_MOBILE_INFERENCE
 class CombinedEvaluator : public Evaluator {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 6888380290074318fe7f94d168b2931e776dda47..968e198cf6608c47089d40fc98c6691f9e5bda5c 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -137,6 +137,13 @@ public:
   /// some finish work, like convert the weight format of MKLDNNLayers
   void finish();
+  /**
+   * @brief Release the middle layer's output memory.
+   *
+   * @note This function is used for memory optimization in inference.
+   */
+  void releaseOutput();
+
 protected:
   /**
   * The constructor of NeuralNetwork.
@@ -158,6 +165,7 @@ protected:
   std::vector dataLayers_;
   std::vector outputLayers_;
+  std::vector middleLayers_;
   static std::map dllInitMap;
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index bc7d1c83a48aefeb4bc6d3baa32b78aba712e58d..925af31289d0c8ca534a30a16b14bfd2df90b013 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
   weight_.reset(new Weight(1, channels_, parameters_[0]));
   movingMean_.reset(new Weight(1, channels_, parameters_[1]));
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index e721d2d267a31cae46407673b8b1281e87055608..2ac3cd9d670d0fcf9c40ad2f117d5a72479663a3 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,6 +94,8 @@ protected:
   bool useGlobalStats_;
   // use to compute moving mean and variance.
   real movingAvgFraction_;
+  // Epsilon is a small constant added to the variance for numerical stability.
+  real epsilon_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index dacff25e5927daf9c991577a71be86b160228317..25ab5cd927792d18f78bc1fa33eee4029b427cc7 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -22,8 +22,6 @@ namespace paddle {
 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
-const real BatchNormalizationLayer::EPS = 1E-5;
-
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                    const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
   calMovingMeanAndVar();
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
   savedInvVar_->copyFrom(*(movingVar_->getW()));
   savedInvVar_->downClip(real(0.0));
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index f6115801fc6b341c0718f8851617de43bdeeec09..1fdb5e2070259a14ab6f70957c9cf03f0699f734 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
-
   /// Load pre-calculated mean and std.
void setMeanAndStd(); diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp index 48132a3ce4cc4b50fea6d755d84d7254d2055bec..e7f081c0232d185c223fc2f48ca79dc84c7f721d 100644 --- a/paddle/gserver/layers/ConvTransProjection.cpp +++ b/paddle/gserver/layers/ConvTransProjection.cpp @@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() { if (outputH_ == 0) outputH_ = configOutH_; if (outputW_ == 0) outputW_ = configOutW_; imageH_ = imageSize(outputH_, - filterH_, + (filterH_ - 1) * dilationH_ + 1, paddingH_, strideH_, /* caffeMode */ true); imageW_ = imageSize(outputW_, - filterW_, + (filterW_ - 1) * dilationW_ + 1, paddingW_, strideW_, /* caffeMode */ true); diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 49a9540c0b6e36b59ed786287ff5c4569b69a6a5..8390b55026c895b661cb514714ba92c05a7bf02e 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -21,8 +21,6 @@ namespace paddle { REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer); -const double CudnnBatchNormLayer::EPS = 1E-5; - bool CudnnBatchNormLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { /* Initialize the basic parent class */ @@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) { real* movingMean = movingMean_->getW()->getData(); real* movingVar = movingVar_->getW()->getData(); + // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. + eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); + if (!useGlobalStats_) { REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); real* savedMean = savedMean_->getData(); @@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { 1.0 - movingAvgFraction_, movingMean, movingVar, - EPS, + eps_, savedMean, savedInvVar); } else { @@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { beta, movingMean, movingVar, - EPS); + eps_); } else { // There is a limitation in cudnn library. // When the batch size is larger than 1024 in cuDNN v5.1, @@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { beta, movingMean, movingVar, - EPS, + eps_, batchSize, channels_, imageH_ * imageD_, @@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { real* savedMean = savedMean_->getData(); real* savedInvVar = savedInvVar_->getData(); + // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. + eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); + auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) { Matrix::resizeOrCreate(m, h, w, false, true); m->zeroMem(); @@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { gamma, gammaGrad, betaGrad, - EPS, + eps_, savedMean, savedInvVar); diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h index 413efd4d3ecd734b343efbcf8328ac0592daddda..1a3f0c0cbf8a1540e77cef70c753c91298728484 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.h +++ b/paddle/gserver/layers/CudnnBatchNormLayer.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "BatchNormBaseLayer.h" #include "Layer.h" #include "paddle/utils/Stat.h" @@ -46,12 +47,9 @@ public: void backward(const UpdateCallback& callback = nullptr) override; protected: - /** - * Epsilon value used in the batch normalization formula. 
- * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h. - * Same epsilon value should be used in forward and backward functions. - */ - static const double EPS; + /// Epsilon value used in the batch normalization formula. + /// Same epsilon value should be used in forward and backward functions. + double eps_; /// Input/output tensor descriptor desc hl_tensor_descriptor ioDesc_; diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..be26b9ba88c279036f73b0a0baaff164755fe067 --- /dev/null +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "FactorizationMachineLayer.h" +#include +#include +#include "paddle/math/SparseMatrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); + +bool FactorizationMachineLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + factorSize_ = config_.factor_size(); + + /* initialize the latentVectors_ */ + CHECK_EQ(inputLayers_.size(), 1UL); + size_t inputSize = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); + latentVectors_ = std::unique_ptr( + new Weight(inputSize, factorSize_, parameters_[0])); + + return true; +} + +void FactorizationMachineLayer::forward(PassType passType) { + Layer::forward(passType); + + const MatrixPtr& inputV = getInputValue(0); + + size_t batchSize = inputV->getHeight(); + size_t outputSize = getSize(); + size_t inputSize = inputLayers_[0]->getSize(); + reserveOutput(batchSize, outputSize); + + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate( + latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + inputMulFactor_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); + + REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); + inputMulFactor_->mul(*inputV, *latentVectors_->getW()); + inputMulFactor_->square2(*tmpOut_); + outV->sumRows(*tmpOut_, 0.5, 0); + + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(inputSquare_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt(), + inputV->getValueType()); + inputSquare_->copyFrom(*inputV); + (dynamic_cast(inputSquare_.get()))->square2(); + } else { + Matrix::resizeOrCreate( + inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + inputV->square2(*inputSquare_); + } + latentVectors_->getW()->square2(*latentVectorsSquare_); + tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); + outV->sumRows(*tmpOut_, -0.5, 1.0); + + /* activation */ { + REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); 
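+    // At this point outV holds the order-2 interaction term per sample,
+    // 0.5 * (rowSum((x * V)^2) - rowSum(x^2 * V^2)); the configured
+    // activation is applied on top of it.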
+ forwardActivation(); + } +} + +void FactorizationMachineLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + const MatrixPtr& inputV = getInputValue(0); + const MatrixPtr& oGrad = getOutputGrad(); + + Matrix::resizeOrCreate( + tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), + latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); + + /* Calculate the gradients of the latentVectors_ matrix */ + if (latentVectors_->getWGrad()) { + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(tmpInput_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt()); + + CpuSparseMatrix* sparseInputV = + dynamic_cast(inputV.get()); + CpuSparseMatrix* sparseInputSquare = + dynamic_cast(inputSquare_.get()); + CpuSparseMatrix* sparseTmpInput = + dynamic_cast(tmpInput_.get()); + sparseTmpInput->copyFrom(*sparseInputV); + + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); + latentVectors_->getWGrad()->mul( + *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); + sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); + + Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); + negOnes_->zeroMem(); + negOnes_->add(-1); + tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); + } else { + Matrix::resizeOrCreate( + tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + + tmpInput_->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul( + *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput_->rowScale(0, *inputSquare_, *oGrad); + + tmpSum_->sumCols(*tmpInput_, -1, 0); + } + + latentVectors_->getWGrad()->addRowScale( + 0, *latentVectors_->getW(), *tmpSumTrans); + + /* Increasing the number of gradient */ + latentVectors_->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers gradient */ + MatrixPtr inGrad = getInputGrad(0); + if (inGrad != NULL) { + inGrad->mul( + *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); + tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); + inGrad->addColScale(0, *inputV, *tmpSum_); + inGrad->rowScale(0, *inGrad, *oGrad); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..df20a49934d5dd444f127842c8fdb7c77f4ebeb1 --- /dev/null +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { +/** + * @brief The Factorization Machine models pairwise (order-2) feature + * interactions as inner product of the learned latent vectors corresponding + * to each input feature. 
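+ *
+ * Rather than enumerating all feature pairs, the implementation relies
+ * on the standard O(k * n) reformulation
+ *
+ * \f[
+ *     \sum_{i<j}\langle v_i, v_j \rangle x_i x_j =
+ *     \frac{1}{2}\sum_{f=1}^{k}\Big(\big(\sum_{i=1}^{n} v_{i,f} x_i\big)^2
+ *     - \sum_{i=1}^{n} v_{i,f}^2 x_i^2\Big)
+ * \f]
+ *
+ * where k is factorSize_ and n is the input size, so the forward pass
+ * reduces to two matrix products.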
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model
+ * higher-order feature interactions, in practice usually only order-2
+ * feature interactions are considered. The Factorization Machine Layer
+ * here only computes the order-2 interactions with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found in the
+ * paper:
+ *
+ * Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  // Store the square values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // A row vector of -1s, used to compute negated column sums via mul()
+  MatrixPtr negOnes_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e5b347aa44ce1951c043d5813a5b3ff..236f8096bdb6e024cf3c9c73eba422616a777a23 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
   preOutput_.value->zeroMem();
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+
weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInput_ = input; + cpuWeight_ = weights_[i]->getW(); + } preOutput_.value->mulByBitCode( - numClasses_, *label, *weights_[i]->getW(), *input); + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_); } // keep consistent with the clipping in the following softrelu preOutput_.value->clip(-40.0, 40.0); preOutput_.value->sumByBitCode(numClasses_, - *label, - *output_.value, + *cpuLabel_, + *cpuOutput_, -1); // scaleSum preOutput_.value->softrelu(*preOutput_.value); - MatrixPtr sum = - Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_)); + MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false); preOutput_.value->rowSum(*sum); - output_.value->add(*sum); + cpuOutput_->add(*sum); + if (useGpu_) { + output_.value->copyFrom(*cpuOutput_); + } else { + output_.value = cpuOutput_; + } } void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { IVectorPtr label = getInput(*getLabelLayer()).ids; + if (useGpu_) { + IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + } else { + cpuLabel_ = label; + } preOutput_.grad->one(); preOutput_.grad->softreluDerivative(*preOutput_.value); - preOutput_.grad->subByBitCode(numClasses_, *label); + preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_); if (biases_ && biases_->getWGrad()) { - preOutput_.grad->addByBitCodeBackward( - numClasses_, *label, *biases_->getWGrad()); - + MatrixPtr biases_grad = biases_->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_grad); + } else { + cpuBias_ = biases_grad; + } + preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_); + if (useGpu_) { + biases_grad->copyFrom(*cpuBias_); + } else { + biases_grad = cpuBias_; + } /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); } @@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { /* Calculate the W-gradient for the current layer */ MatrixPtr input = getInputValue(i); if (weights_[i]->getWGrad()) { + MatrixPtr weights_grad = weights_[i]->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeightGrad_, + weights_grad->getHeight(), + weights_grad->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeightGrad_->copyFrom(*weights_grad); + } else { + cpuInput_ = input; + cpuWeightGrad_ = weights_grad; + } preOutput_.grad->mulByBitCodeBackwardWeight( - numClasses_, *label, *weights_[i]->getWGrad(), *input); - + numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_); + if (useGpu_) { + weights_grad->copyFrom(*cpuWeightGrad_); + } else { + weights_grad = cpuWeightGrad_; + } /* Increasing the number of gradient */ weights_[i]->getParameterPtr()->incUpdate(callback); } @@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { /* Calculate the input layers error */ MatrixPtr inputGrad = getInputGrad(i); if (inputGrad) { + if (useGpu_) { + Matrix::resizeOrCreate(cpuInputGrad_, + inputGrad->getHeight(), + inputGrad->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); 
+ cpuInputGrad_->copyFrom(*inputGrad); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInputGrad_ = inputGrad; + cpuWeight_ = weights_[i]->getW(); + } preOutput_.grad->mulByBitCodeBackwardError( - numClasses_, *label, *weights_[i]->getW(), *inputGrad); + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_); + if (useGpu_) { + inputGrad->copyFrom(*cpuInputGrad_); + } else { + inputGrad = cpuInputGrad_; + } } } } diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 9afd40b1674680da962d6e51caa56b46279b70de..7f896e61ca26e3e22b99b65b1285384a121f7f02 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -80,6 +80,15 @@ protected: int codeLength_; /// temporary result of output_ Argument preOutput_; + + /// The temporary variables in CPU memory. + MatrixPtr cpuWeight_; + MatrixPtr cpuWeightGrad_; + MatrixPtr cpuInput_; + MatrixPtr cpuInputGrad_; + MatrixPtr cpuBias_; + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; }; } // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp index d66c361ae05e4a1089786e4620d2eb2218a8a29c..7faca0f8b7f54fa0a09e8fdab11064c8c26df375 100644 --- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp +++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp @@ -21,8 +21,6 @@ namespace paddle { REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer); -const real MKLDNNBatchNormLayer::EPS = 1E-5; - bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { if (!MKLDNNLayer::init(layerMap, parameterMap)) { @@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap, useGlobalStats_ = config_.use_global_stats(); } movingAvgFraction_ = config_.moving_average_fraction(); + epsilon_ = config_.epsilon(); + VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use") << " --- global stats"; VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_; @@ -210,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD( if (wgt) { flags_ = (flags_ | batch_normalization_flag::use_scale_shift); } - auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_); + auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_); pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_)); CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc()); if (wgt) { @@ -277,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD( } CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc()); auto md = in->getMemoryDesc(); - auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_); + auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_); pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc()); CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc()); diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h index 387c58f02298b0441cc3bbbc4879eed6d892164c..1cf33cb34fa9cd7c9b8487a0a4a0011fb129e311 100644 --- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h +++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h @@ -32,7 +32,8 @@ protected: std::shared_ptr fwdPD_; // Epsilon value used in the batch normalization formula. 
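+  // (The value is now read from the layer config via config_.epsilon()
+  // in init(), rather than being a hard-coded constant.)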
- static const real EPS; + real epsilon_; + // weight and bias in paddle std::unique_ptr weight_; std::unique_ptr biases_; diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac217f1363dbd0360645bbe07cd71a17cc931a79 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp @@ -0,0 +1,163 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLRNLayer.h" +#include "paddle/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer); + +bool MKLDNNLRNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + /* the size of inputs for norm-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + const NormConfig& conf = config_.inputs(0).norm_conf(); + localSize_ = conf.size(); + alpha_ = conf.scale(); + beta_ = conf.pow(); + + ic_ = conf.channels(); + oc_ = ic_; + iw_ = conf.img_size(); + ow_ = conf.output_x(); + ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + CHECK_EQ(iw_, ow_); + CHECK_EQ(ih_, oh_); + return true; +} + +void MKLDNNLRNLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + CHECK_EQ(inputLayers_.size(), 1UL); + reshapeInput(bs, ih, iw); + // ic_ and oc can not be changed + CHECK_EQ((size_t)ic, + inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) + << "Input channel can not be changed"; + oh = ih; + ow = iw; + reshapeOutput(oh, ow); + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNLRNLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], out); + + resetFwdPD(fwdPD_, inputs[0], out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); +} + +void MKLDNNLRNLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr pd; + + resetBwdBuffers(inputs[0], out); + + resetBwdPD(pd, inputs[0], out); + + resetBwdPipeline(pipeline, pd, inputs[0], out); +} + +void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + resetInValue(in); + CHECK(in); + resetOutValue(out, in->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out) { + prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring + : prop_kind::forward_training; + auto fwdDesc = lrn_fwd::desc(pk, + algorithm::lrn_across_channels, + in->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_)); + // prepare workspace if necessary + workspace_ = + passType_ != PASS_TEST + ? 
std::make_shared(memory(pd->workspace_primitive_desc())) + : nullptr; +} + +void MKLDNNLRNLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + fwd_ = workspace_ + ? std::make_shared(lrn_fwd(*pd, *in, *workspace_, *out)) + : std::make_shared(lrn_fwd(*pd, *in, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + CHECK(out); + auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels, + in->getMemoryDesc(), + out->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); +} + +void MKLDNNLRNLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + if (pd == nullptr) { + return; + } + CHECK(inVals_[0]); + CHECK(workspace_); + bwdData_ = std::make_shared( + lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..cfe5621252c71a1de9a0a42a2a88e221e3e56972 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLRNLayer.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "MKLDNNLayer.h" +#include "mkldnn.hpp" + +namespace paddle { +typedef mkldnn::lrn_forward lrn_fwd; +typedef mkldnn::lrn_backward lrn_bwd; + +/** + * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer. 
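+ *
+ * LRN scales each activation by a power (beta_) of the accumulated
+ * squared activations of the localSize_ neighboring channels at the
+ * same spatial position (alpha_ scales the accumulation), using
+ * mkldnn's lrn_across_channels algorithm.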
+ *
+ * The config file api is mkldnn_lrn.
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+protected:
+  // save forward primitive_desc, which can be used in backward
+  std::shared_ptr fwdPD_;
+  // According to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_lrn_backward.cpp, LRN needs a workspace for backward.
+  std::shared_ptr workspace_;
+
+  int localSize_;
+  float alpha_, beta_;  // scale and pow in paddle
+
+public:
+  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNLRNLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector& pipeline,
+                std::vector& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector& pipeline,
+                std::vector& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector& pipeline,
+                        std::shared_ptr& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector& pipeline,
+                        std::shared_ptr& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 28969d01a13b7831794cef856af11ad2ec01c31e..2d0fff608c3932ede57ddcbb6a85cda255b77246 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -132,6 +132,8 @@ void MKLDNNLayer::reshapeInput(int& batchsize,
   if (w != 0) {
     width = w;
   }
+  height = height != 0 ? height : 1;
+  width = width != 0 ?
width : 1; } void MKLDNNLayer::reshapeOutput(size_t height, size_t width) { @@ -294,22 +296,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { srcs.push_back(*src); } - // TODO(TJ): remove me when mkldnn sum support different formats - for (size_t i = 1; i < srcPDs.size(); ++i) { - CHECK(srcPDs[0] == srcPDs[i]); - } - tmpOutGrad_ = out; - tmpCvt_ = nullptr; - if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); - tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); - CHECK(tmpCvt_); - pipelineMergeGrad_.push_back(*tmpCvt_); - } - - auto sumPD = - sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs); - mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_)); + auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs); + mergeGrad_.reset(new sum(sumPD, srcs, *out)); pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 907927f984f1a7cd4a72038515569251df48d56f..3ba39f18b6ae4b38c17b3b72a359fd514e6dad5b 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -36,7 +36,7 @@ class MKLDNNLayer : public Layer { protected: // batch size int bs_; - // they sizes are always from the first input layer + // their sizes are always from the first input layer // input image channel, height and width int ic_, ih_, iw_; // output image channel, height and width @@ -94,15 +94,12 @@ protected: std::vector pipelineMergeGrad_; // tmp input argument to save input grad, only used to merge grad Argument tmpInArg_; - // since mkldnn sum do not support different formats: - // can refer to https://github.com/01org/mkl-dnn/issues/134 - // so need create reorder manually and save tmp MKLDNNMatrix - MKLDNNMatrixPtr tmpOutGrad_; - std::shared_ptr tmpCvt_; public: explicit MKLDNNLayer(const LayerConfig& config) : Layer(config), + ih_(0), + iw_(0), condition_(0), needResetBwd_(true), outputOnlyMKLDNN_(false), diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dd75555fae134664d92ba9f8ffdea8af78166b7e --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
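The `resetMergeGrad` change above relies on `mkldnn::sum` now accepting an output memory descriptor directly, so the manual reorder (`tmpOutGrad_`/`tmpCvt_`) is no longer needed. A minimal sketch of that usage against the MKL-DNN 0.x API (shapes and values are illustrative assumptions):

```cpp
#include <vector>
#include "mkldnn.hpp"

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);

  memory::desc md({2, 4}, memory::data_type::f32, memory::format::nc);
  std::vector<float> a_buf(8, 1.0f), b_buf(8, 2.0f), out_buf(8, 0.0f);
  memory a({md, eng}, a_buf.data());
  memory b({md, eng}, b_buf.data());
  memory out({md, eng}, out_buf.data());

  std::vector<float> scales = {1.0f, 1.0f};
  std::vector<memory::primitive_desc> srcPDs = {a.get_primitive_desc(),
                                                b.get_primitive_desc()};
  // the output memory desc goes straight into the primitive_desc; sum
  // converts between formats internally if the sources differ, so no
  // manual reorder primitive has to be created
  auto sumPD = sum::primitive_desc(md, scales, srcPDs);
  auto s = sum(sumPD, {a, b}, out);
  stream(stream::kind::eager).submit({s}).wait();
  return 0;
}
```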
*/ + +#include "MKLPackedRecurrentLayer.h" + +namespace paddle { + +REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); + +bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!RecurrentLayer::init(layerMap, parameterMap)) return false; + packed_weight_.reset(new MKLPackedWeight(weight_->getW())); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true)); + packed_weightT_->pack(); + } + return true; +} + +void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { + RecurrentLayer::backward(callback); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_->pack(); + } +} + +void MKLPackedRecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); + + batchValue_->copyFromSeq(*output_.value); + + { + REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); + /* forward one batch */ + for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { + MatrixPtr batchValue = batchValue_->getBatchValue(n); + + if (n != 0) { + MatrixPtr preBatchValue = + batchValue_->getBatchValue(n - 1, batchValue->getHeight()); + + packed_weight_->gemm_compute(preBatchValue, batchValue); + } + Argument arg; + arg.value = batchValue; + activation_->forward(arg).check(); + } + } + batchValue_->copyBackSeq(*output_.value); +} + +void MKLPackedRecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + size_t numBatch = batchGrad_->getNumBatch(); + bool backwardByBatch = numBatch < numSequences; + + batchGrad_->copyFromSeq(*output_.grad); + { + REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); + /* backward one batch */ + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr batchGrad = batchGrad_->getBatchValue(n); + MatrixPtr batchValue = + batchValue_->getBatchValue(n, batchGrad->getHeight()); + + Argument arg; + arg.value = batchValue; + arg.grad = batchGrad; + activation_->backward(arg).check(); + + if (n != 0) { + batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); + packed_weightT_->gemm_compute(batchGrad, batchValue); + } + + if (backwardByBatch && weight_->getWGrad()) { + if (n != 0) { + /* backward weight */ + batchValue = + batchValue_->getBatchValue(n - 1, batchGrad->getHeight()); + weight_->getWGrad()->mul( + *batchValue->getTranspose(), *batchGrad, 1, 1); + } + } + } + } + + batchGrad_->copyBackSeq(*output_.grad); + + if (!backwardByBatch && weight_->getWGrad()) { + REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); + for (size_t seq = 0; seq < numSequences; ++seq) { + int len = starts[seq + 1] - starts[seq]; + weight_->getWGrad()->mul( + *output_.value + ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1) + ->getTranspose(), + *output_.grad->subMatrix(reversed_ ? 
+
+void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
+                                            size_t numSequences,
+                                            const int* starts) {
+  if (!batchGrad_) {
+    batchGrad_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchGrad_->shareIndexWith(*batchValue_);
+
+  size_t numBatch = batchGrad_->getNumBatch();
+  bool backwardByBatch = numBatch < numSequences;
+
+  batchGrad_->copyFromSeq(*output_.grad);
+  {
+    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
+    /* backward one batch */
+    for (int n = (int)numBatch - 1; n >= 0; n--) {
+      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
+      MatrixPtr batchValue =
+          batchValue_->getBatchValue(n, batchGrad->getHeight());
+
+      Argument arg;
+      arg.value = batchValue;
+      arg.grad = batchGrad;
+      activation_->backward(arg).check();
+
+      if (n != 0) {
+        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
+        packed_weightT_->gemm_compute(batchGrad, batchValue);
+      }
+
+      if (backwardByBatch && weight_->getWGrad()) {
+        if (n != 0) {
+          /* backward weight */
+          batchValue =
+              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
+          weight_->getWGrad()->mul(
+              *batchValue->getTranspose(), *batchGrad, 1, 1);
+        }
+      }
+    }
+  }
+
+  batchGrad_->copyBackSeq(*output_.grad);
+
+  if (!backwardByBatch && weight_->getWGrad()) {
+    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
+    for (size_t seq = 0; seq < numSequences; ++seq) {
+      int len = starts[seq + 1] - starts[seq];
+      weight_->getWGrad()->mul(
+          *output_.value
+               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
+               ->getTranspose(),
+          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
+                                   len - 1),
+          1,
+          1);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..bded523a8fbd6ff18f28859bd2a1bf3c1a25e2a0
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLPackedWeight.h"
+#include "RecurrentLayer.h"
+
+DECLARE_bool(rnn_use_batch);
+
+namespace paddle {
+
+/**
+ * @brief MKLPackedRecurrentLayer is almost the same as RecurrentLayer,
+ * but is optimized with MKL cblas packed gemm.
+ * More details:
+ * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
+ */
+
+class MKLPackedRecurrentLayer : public RecurrentLayer {
+public:
+  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
+      : RecurrentLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts) override;
+
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int* starts) override;
+
+protected:
+  /// packed_weight_ contains the same data as RecurrentLayer::weight_,
+  /// but in cblas packed format
+  std::unique_ptr<MKLPackedWeight> packed_weight_;
+  /// packed_weightT_ is the packed transpose of packed_weight_
+  std::unique_ptr<MKLPackedWeight> packed_weightT_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
new file mode 100644
index 0000000000000000000000000000000000000000..15d5093beb43e2f086601c2616ace033da34f341
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedWeight.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/MathFunctions.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
+
+namespace paddle {
+
+class MKLPackedWeight {
+protected:
+  /// pointer to the weight data
+  real *weight_;
+  /// pointer to the cblas-packed copy of the weight
+  real *packedWeight_;
+  size_t height_;
+  size_t width_;
+  bool transW_;
+
+public:
+  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
+    packedWeight_ = nullptr;
+    weight_ = weight->getData();
+    height_ = weight->getHeight();
+    width_ = weight->getWidth();
+    transW_ = transW;
+  }
+
+  ~MKLPackedWeight() { free_(); }
+
+  void pack() { pack_(weight_); }
+
+  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,
+                        src->getHeight(),
+                        transW_ ? height_ : width_,
+                        transW_ ? width_ : height_,
+                        src->getData(),
+                        src->getWidth(),
+                        packedWeight_,
+                        width_,
+                        1.0,
+                        dst->getData(),
+                        dst->getWidth());
+  }
+
+protected:
+  void pack_(real *src) {
+    if (!packedWeight_) {
+      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
+    }
+    cblas_sgemm_pack(CblasRowMajor,
+                     CblasBMatrix,
+                     transW_ ? CblasTrans : CblasNoTrans,
+                     1,
+                     transW_ ? height_ : width_,
+                     transW_ ? width_ : height_,
+                     1.0,
+                     src,
+                     width_,
+                     packedWeight_);
+  }
+
+  void free_() {
+    if (packedWeight_) {
+      cblas_sgemm_free(packedWeight_);
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 87613a96c5b3c2da212f63e9e678bcd22308b08e..fceb389d06d8d2cb0357186bf83edda9957c6c19 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -45,6 +45,8 @@ bool PoolLayer::init(const LayerMap& layerMap,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
   return true;
 }
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index d43292ad2d4bbe1229ca59ca21bee92c9ec006a3..9df672a935868e9c61f4dd1fd47a9c309b214f12 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -38,6 +38,8 @@ protected:
 
   std::string poolType_;
 
+  bool excludeMode_;
+
 public:
   explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
 
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp
index d90b438448eb72e72e22e9a91a3cbcd84ac7e6cb..6a9de394cee3769784a38f5512b15f52b1ed6fa1 100644
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
 }
 
 size_t PoolProjection::getSize() {
@@ -141,7 +143,8 @@ void AvgPoolProjection::forward() {
                         outputY_,
                         outputX_,
                         confPaddingY_,
-                        confPadding_);
+                        confPadding_,
+                        excludeMode_);
 }
 
 void AvgPoolProjection::backward(const UpdateCallback& callback) {
@@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) {
                          1,
                          1,
                          confPaddingY_,
-                         confPadding_);
+                         confPadding_,
+                         excludeMode_);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index 9a75f465f6fbb2f2a928b0e36fcfbe0e510d7b3a..a0412714bca7a273e999e4d6bd552e833d20d69c 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -28,6 +28,7 @@ protected:
   int confPaddingY_, confPadding_;
   size_t channels_;
   std::string poolType_;
+  bool excludeMode_;
 
 public:
   PoolProjection(const ProjectionConfig& config,
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
index 02402894d3354a6af221948a3360ef830881bf39..7d7c30b4d89e2dd137e7fc7de3159c07bbab9fb4 100644
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "ROIPoolLayer.h"
+#include <cfloat>
 
 namespace paddle {
 
@@ -83,12 +84,15 @@ void ROIPoolLayer::forward(PassType passType) {
   size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
 
   real* outputData = outputValue->getData();
-  Matrix::resizeOrCreate(maxIdxs_,
-                         numROIs,
-                         channels_ * pooledHeight_ * pooledWidth_,
-                         false,
-                         false);
-  real* argmaxData = maxIdxs_->getData();
+  real* argmaxData = nullptr;
+  if (passType != PASS_TEST) {
+    Matrix::resizeOrCreate(maxIdxs_,
+                           numROIs,
+                           channels_ * pooledHeight_ * pooledWidth_,
+                           false,
+                           false);
+    argmaxData = maxIdxs_->getData();
+  }
 
   for (size_t n = 0; n < numROIs; ++n) {
     // the first five elements of each RoI should be:
@@ -126,8 +130,8 @@ void ROIPoolLayer::forward(PassType passType) {
         bool isEmpty = (hend <= hstart) || (wend <= wstart);
         size_t poolIndex = ph * pooledWidth_ + pw;
-        if (isEmpty) {
-          outputData[poolIndex] = 0;
+        outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
+        if (argmaxData) {
           argmaxData[poolIndex] = -1;
         }
 
@@ -136,7 +140,9 @@
           size_t index = h * width_ + w;
           if (batchData[index] > outputData[poolIndex]) {
             outputData[poolIndex] = batchData[index];
-            argmaxData[poolIndex] = index;
+            if (argmaxData) {
+              argmaxData[poolIndex] = index;
+            }
           }
         }
@@ -144,7 +150,9 @@
       }
       batchData += channelOffset;
       outputData += poolChannelOffset;
-      argmaxData += poolChannelOffset;
+      if (argmaxData) {
+        argmaxData += poolChannelOffset;
+      }
     }
     bottomROIs += roiOffset;
   }
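The ROI-pooling change above skips the argmax bookkeeping at test time and seeds non-empty cells with `-FLT_MAX` (which is why `<cfloat>` is needed). A simplified, hypothetical helper distilling one max-pooling cell from the loop above (not the layer's actual interface):

```cpp
#include <cfloat>

// one output cell of ROI max pooling; argmax may be nullptr (test time)
float roiPoolCell(const float* data, int width,
                  int hstart, int hend, int wstart, int wend,
                  float* argmax) {
  const bool isEmpty = (hend <= hstart) || (wend <= wstart);
  float out = isEmpty ? 0.0f : -FLT_MAX;
  if (argmax) *argmax = -1;
  for (int h = hstart; h < hend; ++h) {    // empty regions skip both loops
    for (int w = wstart; w < wend; ++w) {
      const int index = h * width + w;
      if (data[index] > out) {
        out = data[index];
        if (argmax) *argmax = index;       // remember winner for backward
      }
    }
  }
  return out;
}
```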
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index e4c2b483d2fa4032735858dab17647592791a9c7..6bd42c06cadf755e8703f3fc299d0e6248dd1478 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -12,119 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/utils/Stat.h"
+#include "RecurrentLayer.h"
 
 DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 
 namespace paddle {
 
-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \    \    \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *    out_{i} = act(in_{i})     \    \    \text{for} \ i = end \\
- *    out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
-public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
 REGISTER_LAYER(recurrent, RecurrentLayer);
 
 bool RecurrentLayer::init(const LayerMap& layerMap,
@@ -260,7 +153,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) {
     bias_->getWGrad()->collectBias(*output_.grad, 1);
     bias_->getParameterPtr()->incUpdate(callback);
   }
-
   weight_->getParameterPtr()->incUpdate(callback);
 }
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f40dbe150fa93becfc26f6ea9e55e40eaf208860
--- /dev/null
+++ b/paddle/gserver/layers/RecurrentLayer.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <gflags/gflags.h>
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same as
+ * the input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ *    out_{i} = act(in_{i})     \    \    \text{for} \ i = start \\
+ *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
+ *
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ *    out_{i} = act(in_{i})     \    \    \text{for} \ i = end \\
+ *    out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
+ * \f]
+ * There are two methods to calculate rnn. One way is to compute rnn one
+ * sequence by one sequence. The other way is to reorganize the input
+ * into batches, then compute rnn one batch by one batch. Users can select
+ * them by the rnn_use_batch flag.
+ */
+
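A scalar toy version of the recurrence documented above, with a 1x1 weight and ReLU standing in for the configured activation (illustrative only, not the layer's real matrix-valued code path):

```cpp
#include <algorithm>
#include <vector>

// out[i] = act(in[i])                 for i = start
// out[i] = act(in[i] + out[i-1] * W)  for start < i <= end
std::vector<float> forwardOneSequence(const std::vector<float>& in, float W) {
  std::vector<float> out(in.size());
  auto act = [](float x) { return std::max(0.0f, x); };  // ReLU stand-in
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = act(in[i] + (i == 0 ? 0.0f : out[i - 1] * W));
  }
  return out;
}
```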
+class RecurrentLayer : public Layer {
+public:
+  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+protected:
+  /**
+   * @brief If the user does not set --rnn_use_batch=true, it will
+   * compute rnn forward one sequence by one sequence by default.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn forward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void forwardOneSequence(int start, int length);
+  /**
+   * @brief Compute rnn backward one sequence by one sequence.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn backward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void backwardOneSequence(int start, int length);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn forward batch
+   * by batch. It will convert batch shape to sequence after finishing forward.
+   * The batch info can refer to SequenceToBatch class.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  virtual void forwardBatch(int batchSize,
+                            size_t numSequences,
+                            const int* starts);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn backward batch
+   * by batch.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  virtual void backwardBatch(int batchSize,
+                             size_t numSequences,
+                             const int* starts);
+
+protected:
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> bias_;
+
+  /// frameOutput_[i] is used to hold the i-th sample of output_
+  std::vector<Argument> frameOutput_;
+  MatrixPtr prevOutput_;
+  /// Whether compute rnn by reverse.
+  bool reversed_;
+  /// If compute batch by batch, batchValue_ will be used to save the
+  /// reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// If compute batch by batch, batchGrad_ will be used to save the
+  /// gradient with respect to reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index 5fa7b6f4881b9582b540a5b1bfe849220cc2a4ea..6b769378d24838364701d0f128a7308c6195cc41 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
     hl_sequence2batch_copy(
         batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
   } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
+    if (seq2batch) {
+#ifdef PADDLE_USE_MKLML
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
+#pragma omp parallel for collapse(2)
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
(seqWidth - j) * sizeof(real) + : blockMemSize); + } + } +#else + for (int i = 0; i < batchCount; ++i) { memcpy(batch.rowBuf(i), sequence.rowBuf(idxData[i]), seqWidth * sizeof(real)); - } else { + } +#endif + } else { +#ifdef PADDLE_USE_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < batchCount; ++i) { memcpy(sequence.rowBuf(idxData[i]), batch.rowBuf(i), seqWidth * sizeof(real)); diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index c295ea19c9ccb3d05c509a41925d2c36efdba8ef..b578a906c2027a1169a0098b93f8d0742920f99d 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -1,5 +1,4 @@ # gserver pacakge unittests - add_simple_unittest(test_LinearChainCRF) add_simple_unittest(test_RecurrentLayer) @@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore) gserver_test(test_Expand) gserver_test(test_MaxPoolingWithMaskOutput) +set(PYTHON_PATH + ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests) +function(gserver_test_with_python TARGET) + add_unittest_without_exec(${TARGET} ${TARGET}.cpp) + add_test(NAME ${TARGET} + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endfunction() + +gserver_test_with_python(test_PyDataProvider2) +if(WITH_PYTHON) + gserver_test_with_python(test_PyDataProvider) +endif() +if(NOT MOBILE_INFERENCE) + gserver_test_with_python(test_CompareTwoNets) + # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it. + gserver_test_with_python(test_RecurrentGradientMachine) +endif() + ########## test_MKLDNN layers and activations ########## if(WITH_MKLDNN) add_unittest_without_exec(test_MKLDNN @@ -36,87 +55,43 @@ if(WITH_MKLDNN) MKLDNNTester.cpp LayerGradUtil.cpp) add_test(NAME test_MKLDNN - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python - ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -############## test_PyDataProvider ######################## -if(WITH_PYTHON) - add_unittest_without_exec(test_PyDataProvider - test_PyDataProvider.cpp) - - add_test(NAME test_PyDataProvider - COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -endif() - ############### test_WarpCTCLayer ####################### if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) add_unittest_without_exec(test_WarpCTCLayer test_WarpCTCLayer.cpp) - add_test(NAME test_WarpCTCLayer COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() if(NOT MOBILE_INFERENCE) -################## test_Evaluator ####################### + ################## test_Evaluator ############# add_unittest(test_Evaluator test_Evaluator.cpp) -############### test_RecurrentGradientMachine ############### - # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine - # I will fix it. 
- add_unittest_without_exec(test_RecurrentGradientMachine - test_RecurrentGradientMachine.cpp) - add_test(NAME test_RecurrentGradientMachine - COMMAND .set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) - -############### test_NetworkCompare ############### + ########### test_NetworkCompare ############### add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) - add_test(NAME test_NetworkCompare - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) + set(use_gpu true) else() - add_test(NAME test_NetworkCompare - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) + set(use_gpu false) endif() -endif() - - -add_unittest_without_exec(test_PyDataProvider2 - test_PyDataProvider2.cpp) - -add_test(NAME test_PyDataProvider2 - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle -) + add_test(NAME test_NetworkCompare + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -################# test_CompareSparse ################## -add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) -if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) + ############ test_CompareSparse ################ + add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) + if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) + endif() endif() - -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/gserver/tests/img_conv_cudnn.py index 3934607fa41f9b6d401f1c9ff4aec6715786799b..e424261bda2c0fea4b1796ce443810e8e4e4599b 100644 --- a/paddle/gserver/tests/img_conv_cudnn.py +++ b/paddle/gserver/tests/img_conv_cudnn.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+#See the License for the specific language governing permissions and +#limitations under the License. #edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/gserver/tests/img_conv_exconv.py index ad5a8ba2bde17000ca3d7057c6f399ae28d938b0..3b59cd80b8b8349f022b774b36d4e4db2d517b73 100644 --- a/paddle/gserver/tests/img_conv_exconv.py +++ b/paddle/gserver/tests/img_conv_exconv.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf index 8bbe91e56d0ba6da06475ad16f3162ee1103ee02..0e9d6b31fa8776136b4eee29311383ae6bb21644 100644 --- a/paddle/gserver/tests/mkldnn_simple_net.conf +++ b/paddle/gserver/tests/mkldnn_simple_net.conf @@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp, padding=1, pool_type=MaxPooling()) +tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75) + tmp = fc_layer(input=tmp, size=channels, bias_attr=False, diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py index 4895df186bfecc5cb5263676a9cd5bac5039d565..d20d3331ae6887aa3bca9df7e7817e08ebb91a48 100644 --- a/paddle/gserver/tests/sequence_recurrent.py +++ b/paddle/gserver/tests/sequence_recurrent.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py index e2635b4400b13517bac716a5a0affeb16c218b09..b156fa9baaf386a8625d63020d72828047ddca2f 100644 --- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # @@ -41,7 +54,7 @@ nonseq = embedding_layer(input=label, size=word_dim) # This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf +# sequence_rnn_mixed_inputs.conf def outer_step(subseq, seq, nonseq, encoding): outer_mem = memory(name="outer_rnn_state", size=hidden_dim) diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py index 84a66e294495c01e03dc83b38a531e482bed1292..c2c98aae5f5c5f73c61b309382f2e3d7d649ef47 100644 --- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # @@ -37,7 +50,7 @@ encoding = embedding_layer(input=data2, size=word_dim) # This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf +# sequence_rnn_matched_inputs.conf def outer_step(subseq, seq, nonseq, encoding): outer_mem = memory(name="outer_rnn_state", size=hidden_dim) diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py index 786a0c6d780e4e8deadb35e52901e42dae67a281..c812c6ced2458b6c40e141606655aa45e2bbc55a 100644 --- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved # diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index cacf10692942f5eca2f6c498183f4acc00768460..ba83667ebc9a89c37f77a7f71e6df90b54723cc0 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) { /* caffeMode */ true); conv->set_output_x(output_x); conv->set_output_y(output_y); + LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x + << "; output_y: " << output_y; if (isDeconv) { + int deconv_image_x = imageSize(output_x, + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int deconv_image_y = imageSize(output_y, + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); + + LOG(INFO) << " deconv_image_x: " << deconv_image_x + << "; deconv_image_y: " << deconv_image_y; conf.set_input_size(output_x * output_y * CHANNELS); - conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS); + conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); } else { conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); conf.set_output_size(output_x * output_y * NUM_FILTERS); @@ -681,12 +696,13 @@ TEST(Layer, hsigmoidLayer) { config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); - // Not support GPU now - testLayerGrad(config, - "hsigmoid", - 100, - /* trans */ false, /* useGpu */ - false); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "hsigmoid", + 100, + /* trans */ false, + /* useGpu */ useGpu); + } } TEST(Layer, multi_cross) { @@ -1210,7 +1226,10 @@ void setPoolConfig(TestConfig* config, pool->set_output_y(oh); } -void testPoolLayer(const string& poolType, bool trans, bool useGpu) { +void testPoolLayer(const string& poolType, + bool trans, + bool useGpu, + bool excludeMode = true) { TestConfig config; config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); @@ -1218,6 +1237,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) { pool->set_img_size(14); pool->set_img_size_y(14); + pool->set_exclude_mode(excludeMode); setPoolConfig(&config, pool, poolType); config.layerConfig.set_size(pool->output_x() * pool->output_y() * pool->channels()); @@ -1249,16 +1269,26 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { TEST(Layer, PoolLayer) { testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ false, + /* excludeMode= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false); #ifdef PADDLE_WITH_CUDA testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ true, + /* excludeMode= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2( + "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= 
*/ false, /* useGpu= */ true); #endif } @@ -1442,7 +1472,8 @@ TEST(Layer, RecurrentLayer) { for (auto reversed : {false, true}) { config.layerConfig.set_reversed(reversed); config.testState = !reversed; - testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu); + testLayerGrad( + config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0); } } } @@ -1464,7 +1495,8 @@ TEST(Layer, LstmLayer) { for (auto reversed : {false, true}) { config.layerConfig.set_reversed(reversed); config.testState = !reversed; - testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu); + testLayerGrad( + config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02); } } for (auto useGpu : {true}) { @@ -2464,6 +2496,25 @@ TEST(Layer, L2DistanceLayer) { } } +void testFactorizationMachineLayer(InputType type, bool useGpu) { + const int FACTOR_SIZE = 10; + TestConfig config; + config.layerConfig.set_type("factorization_machine"); + config.layerConfig.set_factor_size(FACTOR_SIZE); + config.layerConfig.set_size(1); + config.biasSize = 0; + config.inputDefs.push_back({type, "layer_0", 128, 1280}); + config.layerConfig.add_inputs(); + testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); +} + +TEST(Layer, FactorizationMachineLayer) { + for (auto useGpu : {false, true}) { + testFactorizationMachineLayer(INPUT_DATA, useGpu); + } + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index 42644e9601a82ea81c417adc6441edeb036998e2..ad1dbc3ee2bfd00a94de06f1e1b2ffe64f19b417 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -272,6 +272,51 @@ TEST(MKLDNNLayer, BatchNormLayer) { testBatchNormLayer({4, 16, 8, 10}); } +struct testLRNDesc { + int bs, ic, ih, iw; + float scale, pow; + int localSize; +}; + +void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { + cfg.layerConfig.set_type("mkldnn_lrn"); + cfg.layerConfig.set_active_type("relu"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_channels(pm.ic); + norm->set_size(pm.localSize); + norm->set_scale(pm.scale); + norm->set_pow(pm.pow); + norm->set_blocked(0); + norm->set_img_size(pm.iw); + norm->set_img_size_y(pm.ih); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + cfg.layerConfig.set_size(layerSize); + cfg.biasSize = 0; +} + +void testLRNLayer(const testLRNDesc& pm) { + TestConfig dnnConfig; + getMKLDNNLRNConfig(dnnConfig, pm); + // mkldnn_lrn <==> norm with cmrnorm-projection type + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("norm"); + LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cmrnorm-projection"); + norm->set_scale(norm->scale() / norm->size()); + RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) +} + +TEST(MKLDNNLayer, LRNLayer) { + testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); + testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); + testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); +} + struct testImageDesc { int bs, ic, ih, iw; }; @@ -315,7 +360,7 @@ TEST(MKLDNNLayer, AddtoLayer) { static void getMKLDNNConcatConfig(TestConfig& cfg, const std::vector& 
inputs) { - CHECK_GE(inputs.size(), 2) << "at least two inputs"; + CHECK_GE(inputs.size(), 2UL) << "at least two inputs"; int oc = inputs[0].ic; for (size_t i = 1; i < inputs.size(); ++i) { CHECK_EQ(inputs[i].bs, inputs[0].bs); diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 16ab0e6aecb6a895b20389992a44dc542eb3b00a..0e130843339a1030f86f4d48891499bac20e9ca2 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) { #define protected public #include "paddle/gserver/layers/GatedRecurrentLayer.h" #include "paddle/gserver/layers/LstmLayer.h" +#include "paddle/gserver/layers/RecurrentLayer.h" template class TestRecurrentLayer { public: @@ -420,12 +421,151 @@ TEST(Layer, LstmLayer) { } } +#ifdef PADDLE_WITH_MKLML + +#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h" + +LayerPtr initMKLPackedLayer(LayerConfig layerConfig, + bool reversed, + int layerSize, + LayerPtr dataLayer, + ParameterPtr para, + ParameterPtr bias = nullptr) { + LayerMap layerMap; + ParameterMap parameterMap; + layerMap[dataLayer->getName()] = dataLayer; + parameterMap[para->getName()] = para; + if (bias) { + parameterMap[bias->getName()] = bias; + layerConfig.set_bias_parameter_name("bias_0"); + } + + layerConfig.set_size(layerSize); + layerConfig.set_reversed(reversed); + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkMKLPackedLayer(LayerConfig layerConfig1, + LayerConfig layerConfig2, + bool reversed, + int layerSize, + int batchSize, + bool useBatch1, + bool useBatch2) { + LayerPtr dataLayer; + ParameterPtr para, bias; + + if (layerConfig1.type() == "recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); + para = creatParameter("para_0", 0, layerSize * layerSize, false); + bias = nullptr; + } else if (layerConfig1.type() == "gated_recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); + para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); + bias = creatParameterBias("bias_0", 1, layerSize * 3, false); + } + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, para, bias); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para, bias); + + const VectorPtr& weightGrad = + (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); + CpuVector wgt_grad1(weightGrad->getSize()); + CpuVector wgt_grad2(weightGrad->getSize()); + CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); + + for (int i = 0; i < 2; i++) { + FLAGS_rnn_use_batch = useBatch1; + + testLayer1->forward(PASS_GC); + + FLAGS_rnn_use_batch = useBatch2; + testLayer2->forward(PASS_GC); + + testLayer1->getOutputGrad()->randomizeUniform(); + testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); + + weightGrad->zero(); + inputGrad->zero(); + FLAGS_rnn_use_batch = useBatch1; + testLayer1->backward(nullptr); + + 
wgt_grad1.copyFrom(*weightGrad);
+    input_grad1.copyFrom(*inputGrad);
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->backward(nullptr);
+
+    wgt_grad2.copyFrom(*weightGrad);
+    input_grad2.copyFrom(*inputGrad);
+
+    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
+    checkError(wgt_grad1, wgt_grad2);
+    checkError(input_grad1, input_grad2);
+  }
+}
+
+TEST(MKLPackedLayer, RecurrentLayer) {
+  LayerConfig layerConfig1;
+  LayerConfig layerConfig2;
+
+  layerConfig1.set_name("paddle-rnn");
+  layerConfig1.set_type("recurrent");
+  layerConfig1.set_active_type("relu");
+
+  layerConfig2.set_name("mkl-packed-rnn");
+  layerConfig2.set_type("mkl_packed_recurrent");
+  layerConfig2.set_active_type("relu");
+
+  FLAGS_use_gpu = false;
+
+  for (auto layerSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {true, false}) {
+        for (auto paddle_use_batch : {true, false}) {
+          for (auto MKLPacked_use_batch : {true, false}) {
+            LOG(INFO) << " layerSize=" << layerSize
+                      << " batchSize=" << batchSize << " reversed=" << reversed
+                      << " paddle_use_batch=" << paddle_use_batch
+                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
+
+            checkMKLPackedLayer(layerConfig1,
+                                layerConfig2,
+                                reversed,
+                                layerSize,
+                                batchSize,
+                                paddle_use_batch,
+                                MKLPacked_use_batch);
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 int main(int argc, char** argv) {
-  if (version::isWithGpu()) {
-    testing::InitGoogleTest(&argc, argv);
-    initMain(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  if (!version::isWithGpu()) {
+    testing::GTEST_FLAG(filter) = "-Layer.*";
   }
+  return RUN_ALL_TESTS();
 }
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8437b2b21942ead544dab8636db1b355b7cf7bd5
--- /dev/null
+++ b/paddle/inference/CMakeLists.txt
@@ -0,0 +1,47 @@
+set(FLUID_CORE_MODULES
+    backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB})
+
+cc_library(paddle_fluid_api
+    SRCS inference.cc
+    DEPS ${FLUID_CORE_MODULES})
+
+# Merge all modules into a single static library
+cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES})
+
+# ptools
+# just for testing, we may need to change the storing format for inference_model
+# and remove the dependency on pickle.
+# download from http://www.picklingtools.com/ +# build in the C++ sub-directory, using command +# make -f Makefile.Linux libptools.so +set(PTOOLS_LIB) +set(PTOOLS_ROOT $ENV{PTOOLS_ROOT} CACHE PATH "Folder contains PicklingTools") +find_path(PTOOLS_INC_DIR chooseser.h PATHS ${PTOOLS_ROOT}/C++) +find_library(PTOOLS_SHARED_LIB NAMES ptools PATHS ${PTOOLS_ROOT}/C++) +if(PTOOLS_INC_DIR AND PTOOLS_SHARED_LIB) + add_definitions(-DPADDLE_USE_PTOOLS) + set(PTOOLS_LIB ptools) + message(STATUS "Found PicklingTools: ${PTOOLS_SHARED_LIB}") + add_library(${PTOOLS_LIB} SHARED IMPORTED GLOBAL) + set_property(TARGET ${PTOOLS_LIB} PROPERTY IMPORTED_LOCATION ${PTOOLS_SHARED_LIB}) + include_directories(${PTOOLS_ROOT}/C++) + include_directories(${PTOOLS_ROOT}/C++/opencontainers_1_8_5/include) + add_definitions(-DOC_NEW_STYLE_INCLUDES) # used in ptools +endif() + +add_executable(example example.cc) +if(APPLE) + set(OPTIONAL_LINK_FLAGS) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") + set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries(example + -Wl,-force_load paddle_fluid + ${OPTIONAL_LINK_FLAGS} + ${PTOOLS_LIB}) +else() + target_link_libraries(example + -Wl,--start-group -Wl,--whole-archive paddle_fluid + -Wl,--no-whole-archive -Wl,--end-group + ${PTOOLS_LIB}) +endif() diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc new file mode 100644 index 0000000000000000000000000000000000000000..9711b20e6fb4099a2cc497029468ebd1fd0b3456 --- /dev/null +++ b/paddle/inference/example.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <time.h>
+#include <iostream>
+#include "gflags/gflags.h"
+#include "paddle/inference/inference.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_string(feed_var_names, "", "Names of feeding variables");
+DEFINE_string(fetch_var_names, "", "Names of fetching variables");
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_dirname.empty() || FLAGS_feed_var_names.empty() ||
+      FLAGS_fetch_var_names.empty()) {
+    // Example:
+    //   ./example --dirname=recognize_digits_mlp.inference.model
+    //             --feed_var_names="x"
+    //             --fetch_var_names="fc_2.tmp_2"
+    std::cout << "Usage: ./example --dirname=path/to/your/model "
+                 "--feed_var_names=x --fetch_var_names=y"
+              << std::endl;
+    exit(1);
+  }
+
+  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::cout << "FLAGS_feed_var_names: " << FLAGS_feed_var_names << std::endl;
+  std::cout << "FLAGS_fetch_var_names: " << FLAGS_fetch_var_names << std::endl;
+
+  std::string dirname = FLAGS_dirname;
+  std::vector<std::string> feed_var_names = {FLAGS_feed_var_names};
+  std::vector<std::string> fetch_var_names = {FLAGS_fetch_var_names};
+
+  paddle::InferenceEngine* engine = new paddle::InferenceEngine();
+  engine->LoadInferenceModel(dirname, feed_var_names, fetch_var_names);
+
+  paddle::framework::LoDTensor input;
+  srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
+  for (int i = 0; i < 784; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+  }
+
+  std::vector<paddle::framework::LoDTensor> feeds;
+  feeds.push_back(input);
+  std::vector<paddle::framework::LoDTensor> fetchs;
+  engine->Execute(feeds, fetchs);
+
+  for (size_t i = 0; i < fetchs.size(); ++i) {
+    auto dims_i = fetchs[i].dims();
+    std::cout << "dims_i:";
+    for (int j = 0; j < dims_i.size(); ++j) {
+      std::cout << " " << dims_i[j];
+    }
+    std::cout << std::endl;
+    std::cout << "result:";
+    float* output_ptr = fetchs[i].data<float>();
+    for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
+      std::cout << " " << output_ptr[j];
+    }
+    std::cout << std::endl;
+  }
+
+  delete engine;
+  return 0;
+}
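`LoadInferenceModel` below slurps the whole serialized `ProgramDesc` into a string with the `seekg`/`tellg` idiom. The same pattern in isolation (a minimal sketch; error handling omitted):

```cpp
#include <fstream>
#include <string>

std::string readBinaryFile(const std::string& path) {
  std::ifstream in(path, std::ios::in | std::ios::binary);
  std::string data;
  in.seekg(0, std::ios::end);   // jump to the end to learn the file size...
  data.resize(in.tellg());
  in.seekg(0, std::ios::beg);   // ...then rewind and read everything at once
  in.read(&data[0], data.size());
  return data;
}
```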
+
+#include "inference.h"
+#include <fstream>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/feed_fetch_method.h"
+#include "paddle/framework/init.h"
+#include "paddle/framework/scope.h"
+
+#ifdef PADDLE_USE_PTOOLS
+#include "chooseser.h"
+#endif
+
+namespace paddle {
+
+void InferenceEngine::LoadInferenceModel(
+    const std::string& dirname,
+    const std::vector<std::string>& feed_var_names,
+    const std::vector<std::string>& fetch_var_names) {
+#ifdef PADDLE_USE_PTOOLS
+  std::string model_filename = dirname + "/__model__";
+  LOG(INFO) << "Using PicklingTools, loading model from " << model_filename;
+  Val v;
+  LoadValFromFile(model_filename.c_str(), v, SERIALIZE_P0);
+  std::string program_desc_str = v["program_desc_str"];
+  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+// PicklingTools cannot parse the vector of strings correctly.
+#else
+  std::string model_filename = dirname + "/__model__.dat";
+  LOG(INFO) << "loading model from " << model_filename;
+  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+  std::string program_desc_str;
+  inputfs.seekg(0, std::ios::end);
+  program_desc_str.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+  inputfs.read(&program_desc_str[0], program_desc_str.size());
+  inputfs.close();
+#endif
+  program_ = new framework::ProgramDesc(program_desc_str);
+  GenerateLoadProgram(dirname);
+
+  if (feed_var_names.empty() || fetch_var_names.empty()) {
+    LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
+  }
+  feed_var_names_ = feed_var_names;
+  fetch_var_names_ = fetch_var_names;
+  PrependFeedOp();
+  AppendFetchOp();
+}
+
+bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
+  if (var->Persistable()) {
+    // There are many unreachable variables in the program
+    for (size_t i = 0; i < program_->Size(); ++i) {
+      const framework::BlockDesc& block = program_->Block(i);
+      for (auto* op : block.AllOps()) {
+        for (auto input_argument_name : op->InputArgumentNames()) {
+          if (input_argument_name == var->Name()) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
+  framework::BlockDesc* global_block = program_->MutableBlock(0);
+
+  load_program_ = new framework::ProgramDesc();
+  framework::BlockDesc* load_block = load_program_->MutableBlock(0);
+  for (auto* var : global_block->AllVars()) {
+    if (IsParameter(var)) {
+      LOG(INFO) << "parameter's name: " << var->Name();
+
+      framework::VarDesc* new_var = load_block->Var(var->Name());
+      new_var->SetShape(var->Shape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      // append_op
+      framework::OpDesc* op = load_block->AppendOp();
+      op->SetType("load");
+      op->SetOutput("Out", {new_var->Name()});
+      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+      op->CheckAttrs();
+    }
+  }
+}
+
+void InferenceEngine::PrependFeedOp() {
+  if (!program_) {
+    LOG(FATAL) << "Please initialize the program_ first.";
+  }
+
+  framework::BlockDesc* global_block = program_->MutableBlock(0);
+
+  // create_var
+  framework::VarDesc* feed_var = global_block->Var("feed");
+  feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
+  feed_var->SetPersistable(true);
+
+  // prepend feed_op
+  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
+    std::string var_name = feed_var_names_[i];
+    LOG(INFO) << "feed var's name: " << var_name;
+
+    // prepend_op
+    framework::OpDesc* op = global_block->PrependOp();
+    op->SetType("feed");
+    op->SetInput("X", {"feed"});
+    op->SetOutput("Out", {var_name});
+    op->SetAttr("col", {static_cast<int>(i)});
+    op->CheckAttrs();
+  }
+}
+
+void InferenceEngine::AppendFetchOp() {
+  if (!program_) {
+    LOG(FATAL) << "Please initialize the program_ first.";
+  }
+
+  framework::BlockDesc* global_block = program_->MutableBlock(0);
+
+  // create_var
+  framework::VarDesc* fetch_var = global_block->Var("fetch");
+  fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
+  fetch_var->SetPersistable(true);
+
+  // append fetch_op
+  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
+    std::string var_name = fetch_var_names_[i];
+    LOG(INFO) << "fetch var's name: " << var_name;
+
+    // append_op
+    framework::OpDesc* op = global_block->AppendOp();
+    op->SetType("fetch");
+    op->SetInput("X", {var_name});
+    op->SetOutput("Out", {"fetch"});
+    op->SetAttr("col", {static_cast<int>(i)});
+    op->CheckAttrs();
+  }
+}
+
+void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
+                              std::vector<framework::LoDTensor>& fetchs) {
+  if (!program_ || !load_program_) {
+    LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
+  }
+
+  if (feeds.size() < feed_var_names_.size()) {
+    LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
+  }
+
+  auto* place = new platform::CPUPlace();
+  framework::InitDevices();
+  framework::Executor* executor = new framework::Executor(*place);
+  framework::Scope* scope = new framework::Scope();
+
+  executor->Run(*load_program_, scope, 0, true, true);
+
+  // set_feed_variable
+  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
+    framework::SetFeedVariable(scope, feeds[i], "feed", i);
+  }
+
+  executor->Run(*program_, scope, 0, true, true);
+
+  // get_fetch_variable
+  fetchs.resize(fetch_var_names_.size());
+  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
+    fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
+  }
+
+  delete place;
+  delete scope;
+  delete executor;
+}
+}  // namespace paddle
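For orientation, here is a minimal sketch of the intended call sequence for the engine implemented above and declared in the header below. This is illustrative only, distilled from example.cc (the model directory and variable names are taken from its usage comment), and is not part of the patch:

```cpp
#include <algorithm>
#include <vector>
#include "paddle/inference/inference.h"

int main() {
  // Load the program and its parameters from the model directory,
  // naming which variables will be fed and fetched.
  paddle::InferenceEngine engine;
  engine.LoadInferenceModel("recognize_digits_mlp.inference.model",
                            {"x"},            // feed variable names
                            {"fc_2.tmp_2"});  // fetch variable names

  // Prepare one input tensor and run the wrapped program.
  std::vector<paddle::framework::LoDTensor> feeds(1), fetchs;
  float* in = feeds[0].mutable_data<float>({1, 784},
                                           paddle::platform::CPUPlace());
  std::fill(in, in + 784, 0.5f);  // arbitrary input values

  engine.Execute(feeds, fetchs);  // fetchs[i] holds the i-th fetched output
  return 0;
}
```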
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3f3ef4b440036a0b27353cc092eed1bbf96eeb3
--- /dev/null
+++ b/paddle/inference/inference.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/program_desc.h"
+
+namespace paddle {
+
+class InferenceEngine {
+public:
+  InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
+  ~InferenceEngine() {
+    delete program_;
+    delete load_program_;
+  }
+
+  void LoadInferenceModel(const std::string& dirname,
+                          const std::vector<std::string>& feed_var_names,
+                          const std::vector<std::string>& fetch_var_names);
+  void Execute(const std::vector<framework::LoDTensor>& feeds,
+               std::vector<framework::LoDTensor>& fetchs);
+
+private:
+  bool IsParameter(const framework::VarDesc* var);
+  void GenerateLoadProgram(const std::string& dirname);
+  void PrependFeedOp();
+  void AppendFetchOp();
+
+private:
+  framework::ProgramDesc* program_;
+  framework::ProgramDesc* load_program_;
+  std::vector<std::string> feed_var_names_;
+  std::vector<std::string> fetch_var_names_;
+};
+
+}  // namespace paddle
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 94ef561f066a127496e2849a419835e175c526d7..17563bf5e1649361b83b896bf864b922296a5487 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,7 @@ public:
    */
   virtual void* alloc(size_t size) {
     void* ptr;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
     // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
     // memory alignment
     CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 86bb270a4372841b3e6f4676e222d2190549c153..922fb5172273da24f9c48786961a6d850b1ed7c5 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -26,8 +26,6 @@ else()
 endif()
 
 if(MOBILE_INFERENCE)
-    list(REMOVE_ITEM MATH_SOURCES
-        ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
     # Remove sparse
     list(REMOVE_ITEM MATH_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03bb1d6e2bdf86d8c56a8157938fb832..dc6979cf5a5229fb09866189f28217889d58c2d0 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
   os << ";";
 }
 
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
+
 void CpuSparseMatrix::randomizeUniform() {
   CHECK_LE(elementCnt_, height_ * width_);
   if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index aad1348353d558abca72ed0fa5cf943237e3ac78..522b436a2a69179d3f4f17c919d5ba024102db7b 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -239,6 +239,15 @@ public:
                 const unsigned int* cols,
                 const real* values);
 
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol  the column of matrix c used to scale each row of b
+   * @param[in]  b     CpuSparseMatrix
+   * @param[in]  c     Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
   void randomizeUniform();
 
   void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index ba86eacbb5d53ee43a60d2cd1dd922333a5d48f0..28ab54b450c96b4bdefdf36813595766162b1434 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) {
 }
 #endif
 
-#if defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_WITH_MKLML)
 
 template <>
 void vExp(const int n, const float* a, float* r) {
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index f6e77029bdd75a602f88b688ca810f47ba4ee615..29fe36e3a4bd5e5d372480950a03142822262d41 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
 
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
 #include <mkl.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 88e9180690606c92cf46c5b295d80f14e5d64567..1ec4336cabbc7d3073b7638b7484bf61e83a2dc5 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
 
+#include "NEONFunctions.h"
 #include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
 
@@ -1130,7 +1131,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
@@ -1153,7 +1155,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                            paddingH,
                            paddingW,
                            data_,
-                           getStride());
+                           getStride(),
+                           excludeMode);
 }
 
 void GpuMatrix::avgPoolBackward(Matrix& outGrad,
@@ -1168,7 +1171,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
 
   real* outDiff = outGrad.getData();
@@ -1194,7 +1198,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                             scaleTargets,
                             scaleOutput,
                             data_,
-                            outGrad.getStride());
+                            outGrad.getStride(),
+                            excludeMode);
 }
 
 void GpuMatrix::maxPool3DForward(Matrix& inputMat,
@@ -2136,7 +2141,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   // The main loop
   size_t num = input.getHeight();
   size_t inLength = imgSizeH * imgSizeW;
@@ -2165,7 +2171,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
             tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
           }
         }
-        int poolSize = (hend - hstart) * (wend - wstart);
+        int poolSize =
+            excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; CHECK(poolSize); tgtData[ph * outputW + pw] /= poolSize; } @@ -2189,7 +2196,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputH / outputW; size_t inLength = imgSizeH * imgSizeW; @@ -2211,7 +2219,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, int wstart = pw * strideW - paddingW; int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - int poolSize = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; CHECK(poolSize); for (int h = hstart; h < hend; ++h) { @@ -4157,16 +4166,36 @@ void CpuMatrix::print(std::ostream& os) const { void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { real* input = data.getData(); real* w = W.getData(); + real* output = data_; size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t paraSize = W.getHeight() * W.getWidth(); CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + if (paraSize == numElements) { + for (size_t n = 0; n < numSamples * numElements; ++n) { + output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; + } + return; + } + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + for (size_t n = 0; n < numSamples; ++n) { + for (size_t i = 0; i < paraSize; i++) { + neon::prelu( + input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); + } + input = input + numElements; + output = output + numElements; + } +#else for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { - data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; + output[k] = input[k] > 0 ? 
input[k] : input[k] * w[i / partial_sum]; } } +#endif } void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index e273f1123690e31984c97185c5a8bc5e7b92c38c..c8e690e6421668bdade4e50a61882c915b2ddc7c 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -911,7 +911,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode = true) { LOG(FATAL) << "Not implemeted"; } @@ -927,9 +928,11 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode = true) { LOG(FATAL) << "Not implemeted"; } + /** * Pooling 3D forward operation, pick out the largest element * in the sizeX of value @@ -1458,7 +1461,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void avgPoolBackward(Matrix& input, size_t imgSizeH, @@ -1472,7 +1476,8 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void maxPool3DForward(Matrix& inputMat, Matrix& maxPoolIdx, @@ -1730,7 +1735,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void avgPoolBackward(Matrix& input, size_t imgSizeH, @@ -1744,7 +1750,8 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void maxPool3DForward(Matrix& inputMat, Matrix& maxPoolIdx, diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp index 3bf47901f1069ac228fa1b877e29848d8cc130e8..0f8314942290a71dd327437b8a6da2d64fe48444 100644 --- a/paddle/math/NEONFunctions.cpp +++ b/paddle/math/NEONFunctions.cpp @@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) { } } +// b[i] = a[i] > 0.0f ? a[i] : a[i] * w +void prelu(const float* a, float w, float* b, int len) { + int offset = len % 16; + float32x4_t ma0, ma1, ma2, ma3; + + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t vw = vdupq_n_f32(w); + + for (int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = vld1q_f32(a); + ma1 = vld1q_f32(a + 4); + ma2 = vld1q_f32(a + 8); + ma3 = vld1q_f32(a + 12); + + uint32x4_t flag0 = vcgtq_f32(ma0, zero); + uint32x4_t flag1 = vcgtq_f32(ma1, zero); + uint32x4_t flag2 = vcgtq_f32(ma2, zero); + uint32x4_t flag3 = vcgtq_f32(ma3, zero); + + float32x4_t mul0 = vmulq_f32(ma0, vw); + float32x4_t mul1 = vmulq_f32(ma1, vw); + float32x4_t mul2 = vmulq_f32(ma2, vw); + float32x4_t mul3 = vmulq_f32(ma3, vw); + + ma0 = vbslq_f32(flag0, ma0, mul0); + ma1 = vbslq_f32(flag1, ma1, mul1); + ma2 = vbslq_f32(flag2, ma2, mul2); + ma3 = vbslq_f32(flag3, ma3, mul3); + + vst1q_f32(b, ma0); + vst1q_f32(b + 4, ma1); + vst1q_f32(b + 8, ma2); + vst1q_f32(b + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + b[i] = a[i] > 0.0f ? 
a[i] : a[i] * w;
+  }
+}
+
 }  // namespace neon
 }  // namespace paddle
diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h
index 69085e333547a31a341fbfde247f1e30adb957ee..d67b2f47a85a963949d23415e4f6881658203bb7 100644
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
@@ -18,6 +18,7 @@ namespace paddle {
 namespace neon {
 
 void relu(const float* a, float* b, int len);
+void prelu(const float* a, float w, float* b, int len);
 
 }  // namespace neon
 }  // namespace paddle
diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h
index 439f11b79d134d7054f45f2d0a70fc5a6fde6c13..76909720f6aef0eea7cdf0dfe618237403d52c99 100644
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) {
 }
 
 namespace internal {
+#ifdef __SSE3__
 void addToImpl(float* a, const float* b, size_t len);
 void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
 void colMaxImpl(float* result, const float* data, int dim, int numSamples);
+#endif
 #ifdef __AVX__
 void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
 void decayL1AvxImpl(
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000000000000000000000000000000000..efebbce50405018c6b7ce2049f8d55c33680469f
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,739 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif  // PADDLE_WITH_CUDA
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#include "paddle/platform/hostdevice.h"
+
+#ifdef __GNUC__
+#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
+#else
+#define PADDLE_GNUC_VER 0
+#endif  // __GNUC__
+
+#ifdef __clang__
+#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
+#else
+#define PADDLE_CLANG_VER 0
+#endif  // __clang__
+
+#if defined(__CUDACC__) && CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
+    (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
+#define PADDLE_WITH_NATIVE_FP16
+#endif
+
+#ifndef PADDLE_ARM
+#include <immintrin.h>
+#endif  // PADDLE_ARM
+
+#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+
+namespace paddle {
+
+// Use PADDLE_ALIGN(2) to ensure that each float16 will be allocated
+// and aligned at least on a 2-byte boundary, which leads to efficient
+// memory access of the float16 struct and also makes float16 compatible
+// with CUDA half, ARM float16_t, and Eigen::half data types.
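+//
+// A minimal usage sketch (mirroring the checks in
+// paddle/math/tests/test_float16.cpp added by this patch):
+//
+//   paddle::float16 h(1.0f);          // float -> half conversion
+//   assert(h.x == 0x3c00);            // IEEE binary16 bit pattern of 1.0f
+//   float f = static_cast<float>(h);  // half -> float, f == 1.0f
+//   static_assert(sizeof(paddle::float16) == 2, "half occupies two bytes");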
+struct PADDLE_ALIGN(2) float16 {
+public:
+  uint16_t x;
+
+  // Constructors
+  HOSTDEVICE inline float16() : x(0) {}
+
+  HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit float16(const half& h) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
+#else
+    x = h.x;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  // __fp16 is a native half precision data type for arm cpu,
+  // float16_t is an alias for __fp16
+  HOSTDEVICE inline explicit float16(const float16_t& h) {
+    x = *reinterpret_cast<const uint16_t*>(&h);
+  }
+#endif
+
+  HOSTDEVICE inline explicit float16(float val) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = __float2half(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float32x4_t tmp = vld1q_dup_f32(&val);
+    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
+    x = *reinterpret_cast<uint16_t*>(&res);
+
+#elif defined(__F16C__)
+    x = _cvtss_sh(val, 0);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v, s;
+    v.f = val;
+    uint32_t sign = v.si & sigN;
+    v.si ^= sign;
+    sign >>= shiftSign;  // logical shift
+    s.si = mulN;
+    s.si = s.f * v.f;  // correct subnormals
+    v.si ^= (s.si ^ v.si) & -(minN > v.si);
+    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+    v.ui >>= shift;  // logical shift
+    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+    x = v.ui | sign;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
+
+  template <typename T>
+  HOSTDEVICE inline explicit float16(const T& val)
+      : x(float16(static_cast<float>(val)).x) {}
+
+  HOSTDEVICE inline float16& operator=(const float16& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+// Assignment operators
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline float16& operator=(const half& rhs) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
+#else
+    x = rhs.x;
+#endif
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
+    x = *reinterpret_cast<const uint16_t*>(&rhs);
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(bool b) {
+    x = b ? 0x3c00 : 0;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(float val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(double val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+// Conversion operators
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit operator half() const {
+#if CUDA_VERSION >= 9000
+    __half_raw h;
+    h.x = x;
+    return half(h);
+#else
+    half h;
+    h.x = x;
+    return h;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit operator Eigen::half() const {
+    Eigen::half h;
+    h.x = x;
+    return h;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline explicit operator float16_t() const {
+    return *reinterpret_cast<const float16_t*>(this);
+  }
+#endif
+
+  HOSTDEVICE inline explicit operator float() const {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = *reinterpret_cast<const half*>(this);
+    return __half2float(tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
+    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
+
+#elif defined(__F16C__)
+    return _cvtsh_ss(this->x);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v;
+    v.ui = this->x;
+    int32_t sign = v.si & sigC;
+    v.si ^= sign;
+    sign <<= shiftSign;
+    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+    Bits s;
+    s.si = mulC;
+    s.f *= v.si;
+    int32_t mask = -(norC > v.si);
+    v.si <<= shift;
+    v.si ^= (s.si ^ v.si) & mask;
+    v.si |= sign;
+    return v.f;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+private:
+  union Bits {
+    float f;
+    int32_t si;
+    uint32_t ui;
+  };
+
+  static const int shift = 13;
+  static const int shiftSign = 16;
+
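+  // shift = 13 is the difference between the float32 (23-bit) and float16
+  // (10-bit) mantissa widths; shiftSign = 16 is the distance between the
+  // sign bits (bit 31 in float32 vs. bit 15 in float16). The constants
+  // below are float32 bit patterns of key float16 thresholds, e.g.
+  // infN (0x7F800000) is +infinity and maxN (0x477FE000) is 65504.0f,
+  // the largest finite float16 value.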
static const int32_t infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; +}; + +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. +#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 + +DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + float res = float(float16(a)) + float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hsub(a, b); +#else + float res = float(float16(a)) - float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hmul(a, b); +#else + float res = float(float16(a)) * float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#else + float res = float(float16(a)) / float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -float(float16(a)); + return half(float16(res)); +#endif +} + +DEVICE inline half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} + +DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __heq(a, b); +#else + return float(float16(a)) == float(float16(b)); +#endif +} + +DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hne(a, b); +#else + return float(float16(a)) != float(float16(b)); +#endif +} + +DEVICE inline bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b); +#else + return float(float16(a)) < float(float16(b)); +#endif +} + +DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && 
__CUDA_ARCH__ >= 530 + return __hle(a, b); +#else + return float(float16(a)) <= float(float16(b)); +#endif +} + +DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(a, b); +#else + return float(float16(a)) > float(float16(b)); +#endif +} + +DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hge(a, b); +#else + return float(float16(a)) >= float(float16(b)); +#endif +} + +#endif // PADDLE_CUDA_FP16 + +// Arithmetic operators on ARMv8.2-A CPU +#if defined(PADDLE_WITH_NATIVE_FP16) +HOST inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +HOST inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +HOST inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +HOST inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +HOST inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +HOST inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator!=(const float16& a, const float16& b) { + return !(a == b); +} + +HOST inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + 
: // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +// Arithmetic operators, software emulated on other CPU +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(float(a) + float(b)); +} + +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(float(a) - float(b)); +} + +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(float(a) * float(b)); +} + +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(float(a) / float(b)); +} + +HOSTDEVICE inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { + a = float16(float(a) + float(b)); + return a; +} + +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { + a = float16(float(a) - float(b)); + return a; +} + +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { + a = float16(float(a) * float(b)); + return a; +} + +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { + a = float16(float(a) / float(b)); + return a; +} + +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { + return float(a) == float(b); +} + +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { + return float(a) != float(b); +} + +HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { + return float(a) < float(b); +} + +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { + return float(a) <= float(b); +} + +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { + return float(a) > float(b); +} + +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { + return float(a) >= float(b); +} +#endif +} // namespace paddle diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index d8b7f9e3fc74040189ade83049e4a1c3348e08de..dcd2a34583417993a4bf2976f7a3bc5a10d496ac 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -22,6 +22,7 @@ if(WITH_GPU) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) 
link_paddle_test(test_lazyAssign) + nv_test(test_float16_gpu SRCS test_float16.cu) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -33,3 +34,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +add_simple_unittest(test_float16) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74cc55aa3792f5e9f86b4f56f28dad97f35996a0 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/math/float16.h" + +#include + +namespace paddle { + +TEST(float16, conversion_cpu) { + // Explicit conversion from Eigen::half + EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + EXPECT_EQ(float16(1.0).x, 0x3c00); + EXPECT_EQ(float16(0.5).x, 0x3800); + EXPECT_EQ(float16(0.33333).x, 0x3555); + EXPECT_EQ(float16(0.0).x, 0x0000); + EXPECT_EQ(float16(-0.0).x, 0x8000); + EXPECT_EQ(float16(65504.0).x, 0x7bff); + EXPECT_EQ(float16(65536.0).x, 0x7c00); + + // Conversion from int + EXPECT_EQ(float16(-1).x, 0xbc00); + EXPECT_EQ(float16(0).x, 0x0000); + EXPECT_EQ(float16(1).x, 0x3c00); + EXPECT_EQ(float16(2).x, 0x4000); + EXPECT_EQ(float16(3).x, 0x4200); + + // Conversion from bool + EXPECT_EQ(float16(true).x, 0x3c00); + EXPECT_EQ(float16(false).x, 0x0000); + + // Default constructor + float16 v_def; + EXPECT_EQ(v_def.x, 0x0000); + + // Assignment operator + float16 v_assign; + v_assign = v_def; + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = Eigen::half(1.0f); + EXPECT_EQ(v_assign.x, 0x3c00); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3800); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3555); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbc00); + v_assign = true; + EXPECT_EQ(v_assign.x, 0x3c00); + + // Conversion operator + EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); + EXPECT_EQ(float(float16(0.5f)), 0.5f); + EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(int(float16(-1)), -1); + EXPECT_EQ(bool(float16(true)), true); +} + +TEST(float16, arithmetic_cpu) { + EXPECT_EQ(float(float16(1) + float16(1)), 2); + EXPECT_EQ(float(float16(5) + float16(-5)), 0); + EXPECT_NEAR(float(float16(0.33333f) + 
float16(0.66667f)), 1.0f, 0.001); + EXPECT_EQ(float(float16(3) - float16(5)), -2); + EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); + EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); + EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(float(-float16(512.0f)), -512.0f); + EXPECT_EQ(float(-float16(-512.0f)), 512.0f); +} + +TEST(float16, comparison_cpu) { + EXPECT_TRUE(float16(1.0f) == float16(1.0f)); + EXPECT_FALSE(float16(-1.0f) == float16(-0.5f)); + EXPECT_TRUE(float16(1.0f) != float16(0.5f)); + EXPECT_FALSE(float16(-1.0f) != float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) < float16(2.0f)); + EXPECT_FALSE(float16(-1.0f) < float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) <= float16(1.0f)); + EXPECT_TRUE(float16(2.0f) > float16(1.0f)); + EXPECT_FALSE(float16(-2.0f) > float16(-2.0f)); + EXPECT_TRUE(float16(2.0f) >= float16(2.0f)); + + EXPECT_TRUE(float16(0.0f) == float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) <= float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) >= float16(-0.0f)); + EXPECT_FALSE(float16(0.0f) < float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) < float16(0.0f)); + EXPECT_FALSE(float16(0.0f) > float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) > float16(0.0f)); +} + +} // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu new file mode 100644 index 0000000000000000000000000000000000000000..4b520feaaf552302a969d8caee8aa28cc143304b --- /dev/null +++ b/paddle/math/tests/test_float16.cu @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/math/float16.h" + +#include + +#include "paddle/utils/Logging.h" + +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, half* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; } + +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define ARITHMETIC_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + half *in1, *in2, *out; \ + half *d_in1, *d_in2, *d_out; \ + int size = sizeof(half); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(float16(out[0])), v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#define COMPOUND_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + int size = sizeof(half); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2); \ + cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(float16(in1[0])), v_out); \ + free(in1); \ + free(in2); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + } + +#define COMPARISON_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, bool v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + bool *out, *d_out; \ + int size = sizeof(half); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, 1); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (bool*)malloc(1); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(out[0], v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#ifdef PADDLE_CUDA_FP16 +namespace paddle { + +#if CUDA_VERSION < 9000 +ARITHMETIC_KERNEL(Add, +) +ARITHMETIC_KERNEL(Sub, -) +ARITHMETIC_KERNEL(Mul, *) +ARITHMETIC_KERNEL(Div, /) + +ARITHMETIC_KERNEL_LAUNCH(Add) +ARITHMETIC_KERNEL_LAUNCH(Sub) +ARITHMETIC_KERNEL_LAUNCH(Mul) +ARITHMETIC_KERNEL_LAUNCH(Div) + +// Negative sign kernel +__global__ void Neg(half* in) { in[0] = -in[0]; } + +void TestNeg(float v_in, float v_out) { + LOG(INFO) << "Test Neg on 
GPU!"; + half *in, *d_in; + int size = sizeof(half); + cudaMalloc((void**)&d_in, size); + in = (half*)malloc(size); + in[0] = half(float16(v_in)); + cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); + Neg<<<1, 1>>>(d_in); + cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); + EXPECT_EQ(float(float16(in[0])), v_out); + free(in); + cudaFree(d_in); +} + +COMPOUND_KERNEL(AddAssign, +=) +COMPOUND_KERNEL(SubAssign, -=) +COMPOUND_KERNEL(MulAssign, *=) +COMPOUND_KERNEL(DivAssign, /=) + +COMPOUND_KERNEL_LAUNCH(AddAssign) +COMPOUND_KERNEL_LAUNCH(SubAssign) +COMPOUND_KERNEL_LAUNCH(MulAssign) +COMPOUND_KERNEL_LAUNCH(DivAssign) + +COMPARISON_KERNEL(Equal, ==) +COMPARISON_KERNEL(NotEqual, !=) +COMPARISON_KERNEL(Less, <) +COMPARISON_KERNEL(LessEqual, <=) +COMPARISON_KERNEL(Greater, >) +COMPARISON_KERNEL(GreaterEqual, >=) + +COMPARISON_KERNEL_LAUNCH(Equal) +COMPARISON_KERNEL_LAUNCH(NotEqual) +COMPARISON_KERNEL_LAUNCH(Less) +COMPARISON_KERNEL_LAUNCH(LessEqual) +COMPARISON_KERNEL_LAUNCH(Greater) +COMPARISON_KERNEL_LAUNCH(GreaterEqual) + +TEST(float16, arithmetic_on_gpu) { + TestAdd(1, 2, 3); + TestSub(2, 1, 1); + TestMul(2, 3, 6); + TestDiv(6, 2, 3); + TestNeg(1, -1); +} + +TEST(float16, compound_on_gpu) { + TestAddAssign(1, 2, 3); + TestSubAssign(2, 1, 1); + TestMulAssign(2, 3, 6); + TestDivAssign(6, 2, 3); +} + +TEST(float16, comparision_on_gpu) { + TestEqual(1, 1, true); + TestEqual(1, 2, false); + TestNotEqual(2, 3, true); + TestNotEqual(2, 2, false); + TestLess(3, 4, true); + TestLess(3, 3, false); + TestLessEqual(3, 3, true); + TestLessEqual(3, 2, false); + TestGreater(4, 3, true); + TestGreater(4, 4, false); + TestGreaterEqual(4, 4, true); + TestGreaterEqual(4, 5, false); +} +#endif // CUDA_VERSION + +TEST(float16, conversion_on_gpu) { + // Explicit conversion to and from cuda half + EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); + + // Assignment operator + float16 v_assign; + v_assign = half(float16(1.0f)); + EXPECT_EQ(v_assign.x, 0x3c00); +} + +} // namespace paddle +#endif // PADDLE_CUDA_FP16 diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 7e5a1db44a5302e3b4e5d2768755824666e880ba..afb8d9d599b15a0b6d19b7ecca5e91b623695dea 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -244,7 +244,7 @@ TEST(Matrix, unary) { LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK" << "support so we cannot test matrix inverse. 
To test " << "matrix inverse, please install LAPACKE " - << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle."; + << "and MKL/Openblas, and re-build PaddlePaddle."; #endif } } diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index aed5275dbf9be707cc6e19e729133ba8eab58195..8841c14ee083fccfd2271efd0c331805919a09d9 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(detail) -cc_library(memory SRCS memory.cc DEPS place) +cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(memcpy SRCS memcpy.cc) cc_library(paddle_memory diff --git a/paddle/memory/README.md b/paddle/memory/README.md index 6cb003c50bc7d142d65b0591e7e5235431d2ea42..7cf61d089b39041b7a15184e0ea9211d14a66f5e 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024); To allocate 4KB memory on the 3rd GPU: ```cpp -p = memory::Alloc(platform::GPUPlace(2), 4*1024); +p = memory::Alloc(platform::CUDAPlace(2), 4*1024); ``` To free memory and check the so-far used amount of memory on a place: ```cpp -auto pl = platform::GPUPlace(0); +auto pl = platform::CUDAPlace(0); p = memory::Alloc(pl, 4*1024); cout << memory::Used(pl); memory::Free(pl, p); @@ -36,7 +36,7 @@ template size_t Used(Place); } // namespace memory ``` -These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: +These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`: ```cpp template<> @@ -49,7 +49,7 @@ and ```cpp template<> -void Alloc(GPUPlace p, size_t size) { +void Alloc(CUDAPlace p, size_t size) { return GetGPUBuddyAllocator(p.id)->Alloc(size); } ``` @@ -122,7 +122,7 @@ There are two implementations of `Context`: 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. -1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. +1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. ### Majel diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 64ee53803891f192302bb915027f0499dfa36411..2bc2c06a15702b29c8bdf755978aebe0e6219b4a 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/memory/detail/buddy_allocator.h" #include "glog/logging.h" diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 9c41378483993101a098fc4ad1068c1ef908e566..4e0135dd655d04b7f99722a2159795738c1b29c7 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index fc40993208323f1f5d18103165c8835b5f829613..f50eceba096477d7b2f50f7c406770c8e9595332 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/memory/detail/memory_block.h" #include "paddle/memory/detail/meta_cache.h" diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h index a5168b519f3a3747f34ef2ea7b87d72dce70064d..a4ca51b31b0df269b4b48f824bfd2b74f2a059fb 100644 --- a/paddle/memory/detail/memory_block.h +++ b/paddle/memory/detail/memory_block.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc index 7e2f92b00ca5d787c1114176c5dc3304ca3ebe26..2bacca75108f9f80e7aa291fcb4fd66112201394 100644 --- a/paddle/memory/detail/meta_cache.cc +++ b/paddle/memory/detail/meta_cache.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/memory/detail/meta_cache.h" #include "glog/logging.h" diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index cf5815644284c23a1d2abc904f8c5053ce107a72..db8ffd49ae30cf72ca691894af2df08a7106d02f 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc index 70c5c1f439e84ec33cf0507beae33f9cdfa51727..dc57d4d2376ab1cfe6fb49c92af4591b3972a53a 100644 --- a/paddle/memory/detail/meta_data.cc +++ b/paddle/memory/detail/meta_data.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/memory/detail/meta_data.h" diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h index 628cf1f2e347e288d1bf34c14c7b2f13a28d3662..6b83c42eb851f0487bed6d625d848cf90db00929 100644 --- a/paddle/memory/detail/meta_data.h +++ b/paddle/memory/detail/meta_data.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 6b4e46f56a0c9c9836c5b353ec9c554454ab0491..509250debc2b2fd2e87078ab5f233ae2db6fd898 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include // for malloc and free #include // for mlock and munlock +#include // for std::max #include "gflags/gflags.h" @@ -28,7 +29,7 @@ limitations under the License. */ // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); - +DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { namespace memory { namespace detail { @@ -43,7 +44,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { void* p; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); @@ -77,45 +78,20 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. if (size <= 0) return nullptr; - - size_t available = 0; - size_t capacity = 0; - paddle::platform::GpuMemoryUsage(available, capacity); - - // Reserve memory for page tables, etc. - size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); - size_t usable = available > reserving ? available - reserving : 0; - - // If remaining size no less than expected size, using general - // cudaMalloc to allocate GPU memory. - void* p = 0; - if (size <= usable) { - cudaError_t result = cudaMalloc(&p, size); - if (result == cudaSuccess) { - index = 0; - gpu_alloc_size_ += size; - return p; - } - } - - // If remaining size less than expected size or cudaMalloc failed, - // cudaMallocHost will be considered as a fallback allocator. 
- // - // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size - // of host fallback allocation. Allocates too much would reduce - // the amount of memory available to the underlying system for paging. - usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; - - if (size > usable) return nullptr; - - cudaError_t result = cudaMallocHost(&p, size); + void* p; + cudaError_t result = cudaMalloc(&p, size); if (result == cudaSuccess) { - index = 1; - fallback_alloc_size_ += size; + index = 0; + gpu_alloc_size_ += size; return p; + } else { + LOG(WARNING) + << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " + "environment variable to a lower value. Current value is " + << FLAGS_fraction_of_gpu_memory_to_use; + return nullptr; } - - return nullptr; } void GPUAllocator::Free(void* p, size_t size, size_t index) { diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 1df88a6da9fb0c50d0d7ecd083c0533d8a886a67..b46141aafd7146bd3def12d86108c10f1f143d20 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -28,31 +28,25 @@ void Copy(platform::CPUPlace, void* dst, #ifdef PADDLE_WITH_CUDA template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); @@ -62,33 +56,6 @@ void Copy(platform::GPUPlace dst_place, } } -template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(src_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); -} - #endif } // namespace memory diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5eb1c44eb6fc45db31ef44bf79e74b79193e08aa..1a73a94567e45b81a0b148965a834f03c7407ffe 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -64,38 +64,71 @@ 
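With the synchronous `GpuMemcpySync` specializations deleted above, the GPU `memory::Copy` overloads all take an explicit `cudaStream_t`, which keeps synchronization visible at the call site. A hedged usage sketch, assuming `memory::Copy` is declared in `paddle/memory/memcpy.h` (the header for the `memcpy.cc` touched here):

```cpp
#include <vector>

#include <cuda_runtime.h>

#include "paddle/memory/memcpy.h"  // assumed header for memory::Copy
#include "paddle/memory/memory.h"

void CopyHostToGpu() {
  namespace memory = paddle::memory;
  namespace platform = paddle::platform;

  std::vector<float> host(1024, 1.0f);
  platform::CUDAPlace gpu(0);
  void* dev = memory::Alloc(gpu, host.size() * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Asynchronous H2D copy on the given stream; the caller decides when
  // (and whether) to synchronize.
  memory::Copy(gpu, dev, platform::CPUPlace(), host.data(),
               host.size() * sizeof(float), stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);

  memory::Free(gpu, dev);
}
```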
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { int gpu_num = platform::GetCUDADeviceCount(); as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - platform::SetDeviceId(gpu); - as[gpu] = new BuddyAllocator(new detail::GPUAllocator, - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); + as[gpu] = nullptr; } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n" - << "You can set environment variable '" - << platform::kEnvFractionGpuMemoryToUse + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" << "' to change the fraction of GPU usage.\n\n"; } - platform::SetDeviceId(gpu_id); return as[gpu_id]; } template <> -void* Alloc(platform::GPUPlace place, size_t size) { - return GetGPUBuddyAllocator(place.device)->Alloc(size); +size_t Used(platform::CUDAPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void Free(platform::GPUPlace place, void* p) { - GetGPUBuddyAllocator(place.device)->Free(p); +void* Alloc(platform::CUDAPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; } template <> -size_t Used(platform::GPUPlace place) { - return GetGPUBuddyAllocator(place.device)->Used(); +void Free(platform::CUDAPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } #endif +size_t Usage::operator()(const platform::CPUPlace& cpu) const { + return Used(cpu); +} + +size_t Usage::operator()(const platform::CUDAPlace& gpu) const { +#ifdef PADDLE_WITH_CUDA + return Used(gpu); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +size_t memory_usage(const platform::Place& p) { + return boost::apply_visitor(Usage(), p); +} + } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 11bbb881874ec50e1132547336fc6fb6b42bcc4f..7012b6d331d0c4631a3d120fbaf3db7c97298ac7 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -54,6 +54,13 @@ void Free(Place place, void* ptr); template size_t Used(Place place); +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; +}; + +size_t memory_usage(const platform::Place& p); + /** * \brief Free memory block in one place. 
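The new `Usage` visitor above gives a single entry point over the `Place` variant: `memory_usage(p)` dispatches to the matching `Used` specialization via `boost::apply_visitor`. A small sketch of what that enables, mirroring the check added to `memory_test.cc` below:

```cpp
#include <iostream>

#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"  // assumed header for platform::Place

// Report usage for a place chosen at runtime; the variant hides whether
// we are on the CPU or a CUDA device.
void ReportUsage(bool on_gpu) {
  namespace platform = paddle::platform;

  platform::Place place = platform::CPUPlace();
  if (on_gpu) {
    // In a CPU-only build this path throws, per the PADDLE_THROW above.
    place = platform::CUDAPlace(0);
  }
  std::cout << paddle::memory::memory_usage(place) << " bytes in use\n";
}
```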
* diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 2444931e26774ae80b916fbb7bd46ff93025d9ed..b3f699f9b7eff54c06ff69023db082380c83467a 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -44,6 +44,9 @@ TEST(BuddyAllocator, CPUAllocation) { EXPECT_NE(p, nullptr); + paddle::platform::Place place = cpu; + EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); + paddle::memory::Free(cpu, p); } @@ -82,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifdef PADDLE_WITH_CUDA -size_t align(size_t size, paddle::platform::GPUPlace place) { +size_t align(size_t size, paddle::platform::CUDAPlace place) { size += sizeof(paddle::memory::detail::Metadata); size_t alignment = paddle::platform::GpuMinChunkSize(); size_t remaining = size % alignment; @@ -94,16 +97,19 @@ TEST(BuddyAllocator, GPUAllocation) { EXPECT_EQ(p, nullptr); - paddle::platform::GPUPlace gpu(0); + paddle::platform::CUDAPlace gpu(0); p = paddle::memory::Alloc(gpu, 4096); EXPECT_NE(p, nullptr); + paddle::platform::Place place = gpu; + EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place)); + paddle::memory::Free(gpu, p); } TEST(BuddyAllocator, GPUMultAlloc) { - paddle::platform::GPUPlace gpu; + paddle::platform::CUDAPlace gpu; std::unordered_map ps; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 059a6bba84cfb0c1f6cbbba3c88d589b52dc5592..2412ebd82a02c872e73fd310c56221309441f630 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +set(DEPS_OPS "") set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. 
DO NOT EDIT!\n\n") function(op_library TARGET) @@ -48,6 +49,10 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) @@ -56,99 +61,28 @@ function(op_library TARGET) ${op_common_deps}) endif() - # net_op doesn't need pybind - if ("${TARGET}" STREQUAL "net_op") - set(pybind_flag 1) - endif() - - if ("${TARGET}" STREQUAL "compare_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") - endif() - - # conv_op contains several operators - if ("${TARGET}" STREQUAL "conv_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d);\n") - endif() - - # pool_op contains several operators - if ("${TARGET}" STREQUAL "pool_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d);\n") - endif() - - # pool_cudnn_op contains several operators - if ("${TARGET}" STREQUAL "pool_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") - endif() - - if ("${TARGET}" STREQUAL "logical_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(logical_and);\n") - endif() - - # pool_with_index_op contains several operators - if ("${TARGET}" STREQUAL "pool_with_index_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") - endif() - - # conv_transpose_op contains several operators - if ("${TARGET}" STREQUAL "conv_transpose_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n") - endif() - - # conv_transpose_cudnn_op contains two operators - if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n") - endif() - - # save_restore_op contains several operators - if ("${TARGET}" STREQUAL "save_restore_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n") - endif() - - # activation_op contains several operators - if ("${TARGET}" STREQUAL "activation_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(sigmoid);\n") - endif() - - # nccl_op contains several operators - if ("${TARGET}" STREQUAL "nccl_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") - endif() - - # reduce_op contains several operators - if ("${TARGET}" STREQUAL "reduce_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") - endif() + # Define operators that don't need pybind here. 
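The rewritten `op_library` no longer special-cases each multi-operator target; instead it reads `${TARGET}.cc` (the `file(READ ...)` / `string(REGEX MATCH ...)` lines below) and keys off the registration macro itself. Restating, with annotations, the kind of line it matches, using the `tanh` registration that appears later in this diff:

```cpp
// If a *_op.cc contains two or more REGISTER_OP(...) calls, op_library()
// extracts the first registered name ("tanh" here) and that name is what
// lands in paddle/pybind/pybind.h as USE_OP(tanh);.
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
            ops::ActivationOpGrad);
// With a single REGISTER_OP in the file, it instead falls back to the
// target name with the trailing "_op" stripped.
```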
+ foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() - if ("${TARGET}" STREQUAL "tensor_array_read_write_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n") + # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. + file(READ ${TARGET}.cc TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") endif() # pybind USE_NO_KERNEL_OP # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel - file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -159,7 +93,6 @@ function(op_library TARGET) # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) @@ -174,82 +107,87 @@ endfunction() add_subdirectory(math) add_subdirectory(nccl) -set(DEPS_OPS - cond_op - cross_entropy_op - recurrent_op - dynamic_recurrent_op - softmax_with_cross_entropy_op - softmax_op - sequence_softmax_op - sum_op - pool_op - maxout_op - pool_with_index_op - conv_op - conv_transpose_op - nccl_op - sequence_conv_op - sequence_pool_op - lod_rank_table_op - lod_tensor_to_array_op - array_to_lod_tensor_op - lstm_op - tensor_array_read_write_op - gru_op - adagrad_op - sgd_op) - - -op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) +if(WITH_GPU) + op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") +else() + set(DEPS_OPS ${DEPS_OPS} nccl_op) +endif() + +if(WITH_DISTRIBUTE) + add_subdirectory(detail) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) +else() + set(DEPS_OPS ${DEPS_OPS} send_op recv_op) +endif() + +op_library(cond_op DEPS framework_proto tensor net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) +op_library(detection_output_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) +op_library(print_op DEPS 
lod_tensor) op_library(adagrad_op DEPS selected_rows_functor) -op_library(conv_op DEPS vol2col) -op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) -op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) -op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) -op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) -op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc) -if(WITH_GPU) -op_library(nccl_op DEPS nccl_common) -endif() +op_library(lod_rank_table_op DEPS lod_rank_table) +op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) +op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) -op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) -if(WITH_TESTING) - op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc - DEPS net_op tensor_array gtest) +op_library(recurrent_op DEPS executor) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) +op_library(cos_sim_op DEPS cos_sim_functor) +op_library(parallel_do_op DEPS executor) + +# Regist multiple Kernel to pybind +if (WITH_GPU) +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc + conv_transpose_cudnn_op.cu.cc DEPS vol2col) +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") else() - op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc - DEPS net_op tensor_array) +op_library(conv_op SRCS conv_op.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) endif() -op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) + +# FIXME(typhoonzero): save/load depends lodtensor serialization functions +op_library(save_op DEPS lod_tensor) +op_library(load_op DEPS lod_tensor) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") + cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) -cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc - rnn/recurrent_op_utils.cc - DEPS dynamic_recurrent_op) if(WITH_GPU) - cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op 
load_op) diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 2785a8c6fb62527db4d203788be88ebead068a19..8e8a3c7dd3036317fac29b709d7a29e18f017503 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -53,18 +53,17 @@ class AccuracyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { public: - AccuracyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { // TODO(typhoonzero): support both inference value and indices. AddInput("Out", "The network output of topk (inferences)"); diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index d2dcab4e548b99c6beecfaa570ac31804fd07d82..0aadd5af41531e54b357756441f92da668d4ec01 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -26,7 +26,7 @@ template __global__ void AccuracyCudaKernel(const int N, const int D, const int64_t* Xdata, const int64_t* labeldata, int* correct_data, - float* accuracy) { + float* accuracy, int* total_data) { int count = 0; __shared__ int total[BlockSize]; @@ -47,6 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, if (threadIdx.x == 0) { *correct_data = result; *accuracy = static_cast(result) / static_cast(N); + *total_data = N; } } @@ -55,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* inference = ctx.Input("Out"); auto* indices = ctx.Input("Indices"); auto* label = ctx.Input("Label"); @@ -80,22 +81,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { if (num_samples == 0) { return; } - platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int), - cudaMemcpyHostToDevice, stream); AccuracyCudaKernel< PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data); - - int d_num_samples, d_num_correct; - float d_accuracy; - platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int), - cudaMemcpyDeviceToHost, stream); - platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int), - cudaMemcpyDeviceToHost, stream); - platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float), - cudaMemcpyDeviceToHost, stream); + accuracy_data, total_data); } }; @@ -104,5 +94,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. 
// label data is always int64 -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h index d060e6edddb31ecc1a4d27836f80b8ac5fa7d36d..04104a695fac6a967ad94780e31ba3fdd2ca2eda 100644 --- a/paddle/operators/accuracy_op.h +++ b/paddle/operators/accuracy_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class AccuracyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 83d35a450d0e8ebf5311cdfd948b066642ccec8c..4188858a90daf8b2c10eb6960393de977d467371 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/activation_op.h" @@ -22,8 +22,8 @@ class ActivationOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - ctx->SetOutputDim("Y", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Y"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -32,21 +32,20 @@ class ActivationOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } }; class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: - SigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sigmoid operator"); - AddOutput("Y", "Output of Sigmoid operator"); + AddOutput("Out", "Output of Sigmoid operator"); AddComment(R"DOC( -Sigmoid Activation Operator. 
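The renamed output (`Y` → `Out`) threads through both the DOC formulas here and the gradient functors in `activation_op.h` further down. For the sigmoid pair specifically, the identity the backward pass relies on is worth stating once:

$$out = \frac{1}{1 + e^{-x}}, \qquad \frac{\partial\, out}{\partial x} = out\,(1 - out)$$

which is why `SigmoidGradFunctor` below can compute `dx = dout * out * (1 - out)` from the saved output alone, without re-reading `x`.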
+Sigmoid Activation Operator -$y = 1 / (1 + e^{-x})$ +$$out = \frac{1}{1 + e^{-x}}$$ )DOC"); } @@ -54,15 +53,14 @@ $y = 1 / (1 + e^{-x})$ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: - LogSigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of LogSigmoid operator"); - AddOutput("Y", "Output of LogSigmoid operator"); + AddOutput("Out", "Output of LogSigmoid operator"); AddComment(R"DOC( -Logsigmoid Activation Operator. +Logsigmoid Activation Operator -$y = \log(1 / (1 + e^{-x}))$ +$$out = \log \frac{1}{1 + e^{-x}}$$ )DOC"); } @@ -70,14 +68,14 @@ $y = \log(1 / (1 + e^{-x}))$ class ExpOpMaker : public framework::OpProtoAndCheckerMaker { public: - ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Exp operator"); - AddOutput("Y", "Output of Exp operator"); + AddOutput("Out", "Output of Exp operator"); AddComment(R"DOC( Exp Activation Operator. -$y = e^x$ +$out = e^x$ )DOC"); } @@ -85,53 +83,47 @@ $y = e^x$ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu operator"); - AddOutput("Y", "Output of Relu operator"); + AddOutput("Out", "Output of Relu operator"); AddComment(R"DOC( Relu Activation Operator. -$y = \max(x, 0)$ +$out = \max(x, 0)$ )DOC"); } }; -template class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - LeakyReluOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of LeakyRelu operator"); - AddOutput("Y", "Output of LeakyRelu operator"); - AddAttr("alpha", "The small negative slope") - .SetDefault(static_cast(0.02f)); + AddOutput("Out", "Output of LeakyRelu operator"); + AddAttr("alpha", "The small negative slope").SetDefault(0.02f); AddComment(R"DOC( LeakyRelu Activation Operator. -$y = \max(x, \alpha * x)$ +$out = \max(x, \alpha * x)$ )DOC"); } }; -template class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftShrinkOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Softshrink operator"); - AddOutput("Y", "Output of Softshrink operator"); - AddAttr("lambda", "non-negative offset") - .SetDefault(static_cast(0.5f)); + AddOutput("Out", "Output of Softshrink operator"); + AddAttr("lambda", "non-negative offset").SetDefault(0.5f); AddComment(R"DOC( Softshrink Activation Operator. 
$$ -y = \begin{cases} +out = \begin{cases} x - \lambda, \text{if } x > \lambda \\ x + \lambda, \text{if } x < -\lambda \\ 0, \text{otherwise} @@ -144,14 +136,14 @@ $$ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { public: - TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Tanh operator"); - AddOutput("Y", "Output of Tanh operator"); + AddOutput("Out", "Output of Tanh operator"); AddComment(R"DOC( Tanh Activation Operator. -$$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"); } @@ -159,35 +151,32 @@ $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - TanhShrinkOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of TanhShrink operator"); - AddOutput("Y", "Output of TanhShrink operator"); + AddOutput("Out", "Output of TanhShrink operator"); AddComment(R"DOC( TanhShrink Activation Operator. -$$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"); } }; -template class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - HardShrinkOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of HardShrink operator"); - AddOutput("Y", "Output of HardShrink operator"); - AddAttr("threshold", "The value of threshold for HardShrink") - .SetDefault(static_cast(0.5)); + AddOutput("Out", "Output of HardShrink operator"); + AddAttr("threshold", "The value of threshold for HardShrink") + .SetDefault(0.5f); AddComment(R"DOC( HardShrink Activation Operator. $$ -y = \begin{cases} +out = \begin{cases} x, \text{if } x > \lambda \\ x, \text{if } x < -\lambda \\ 0, \text{otherwise} @@ -200,14 +189,14 @@ $$ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { public: - SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sqrt operator"); - AddOutput("Y", "Output of Sqrt operator"); + AddOutput("Out", "Output of Sqrt operator"); AddComment(R"DOC( Sqrt Activation Operator. -$y = \sqrt{x}$ +$out = \sqrt{x}$ )DOC"); } @@ -215,14 +204,59 @@ $y = \sqrt{x}$ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { public: - AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Abs operator"); - AddOutput("Y", "Output of Abs operator"); + AddOutput("Out", "Output of Abs operator"); AddComment(R"DOC( Abs Activation Operator. 
-$y = |x|$ +$out = |x|$ + +)DOC"); + } +}; + +class CeilOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Ceil operator"); + AddOutput("Out", "Output of Ceil operator"); + AddComment(R"DOC( +Ceil Activation Operator. + +$out = ceil(x)$ + +)DOC"); + } +}; + +class FloorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Floor operator"); + AddOutput("Out", "Output of Floor operator"); + AddComment(R"DOC( +Floor Activation Operator. + +$out = floor(x)$ + +)DOC"); + } +}; + +class RoundOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Round operator"); + AddOutput("Out", "Output of Round operator"); + AddComment(R"DOC( +Round Activation Operator. + +$out = [x]$ )DOC"); } @@ -230,15 +264,14 @@ $y = |x|$ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReciprocalOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Reciprocal operator"); - AddOutput("Y", "Output of Reciprocal operator"); + AddOutput("Out", "Output of Reciprocal operator"); AddComment(R"DOC( Reciprocal Activation Operator. -$$y = \frac{1}{x}$$ +$$out = \frac{1}{x}$$ )DOC"); } @@ -246,14 +279,14 @@ $$y = \frac{1}{x}$$ class LogOpMaker : public framework::OpProtoAndCheckerMaker { public: - LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + LogOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Log operator"); - AddOutput("Y", "Output of Log operator"); + AddOutput("Out", "Output of Log operator"); AddComment(R"DOC( Log Activation Operator. -$y = \ln(x)$ +$out = \ln(x)$ Natural logarithm of x. @@ -263,14 +296,14 @@ Natural logarithm of x. class SquareOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Square operator"); - AddOutput("Y", "Output of Square operator"); + AddOutput("Out", "Output of Square operator"); AddComment(R"DOC( Square Activation Operator. -$y = x^2$ +$out = x^2$ )DOC"); } @@ -278,15 +311,14 @@ $y = x^2$ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftplusOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Softplus operator"); - AddOutput("Y", "Output of Softplus operator"); + AddOutput("Out", "Output of Softplus operator"); AddComment(R"DOC( Softplus Activation Operator. 
-$y = \ln(1 + e^{x})$ +$out = \ln(1 + e^{x})$ )DOC"); } @@ -294,151 +326,139 @@ $y = \ln(1 + e^{x})$ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftsignOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Softsign operator"); - AddOutput("Y", "Output of Softsign operator"); + AddOutput("Out", "Output of Softsign operator"); AddComment(R"DOC( Softsign Activation Operator. -$$y = \frac{x}{1 + |x|}$$ +$$out = \frac{x}{1 + |x|}$$ )DOC"); } }; -template class BReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of BRelu operator"); - AddOutput("Y", "Output of BRelu operator"); - AddAttr("t_min", "The min marginal value of BRelu") - .SetDefault(static_cast(0)); - AddAttr("t_max", "The max marginal value of BRelu") - .SetDefault(static_cast(24)); + AddOutput("Out", "Output of BRelu operator"); + AddAttr("t_min", "The min marginal value of BRelu") + .SetDefault(static_cast(0)); + AddAttr("t_max", "The max marginal value of BRelu") + .SetDefault(static_cast(24)); AddComment(R"DOC( BRelu Activation Operator. -$y = \max(\min(x, t_{min}), t_{max})$ +$out = \max(\min(x, t_{min}), t_{max})$ )DOC"); } }; -template class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftReluOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of SoftRelu operator"); - AddOutput("Y", "Output of SoftRelu operator"); - AddAttr("threshold", "The threshold value of SoftRelu") - .SetDefault(static_cast(40)); + AddOutput("Out", "Output of SoftRelu operator"); + AddAttr("threshold", "The threshold value of SoftRelu") + .SetDefault(40.0f); AddComment(R"DOC( SoftRelu Activation Operator. -$y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ +$out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ )DOC"); } }; -template class ELUOpMaker : public framework::OpProtoAndCheckerMaker { public: - ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of ELU operator"); - AddOutput("Y", "Output of ELU operator"); - AddAttr("alpha", "The alpha value of ELU") - .SetDefault(static_cast(1.0f)); + AddOutput("Out", "Output of ELU operator"); + AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); AddComment(R"DOC( ELU Activation Operator. Applies the following element-wise computation on the input according to https://arxiv.org/abs/1511.07289. 
-$y = \max(0, x) + \min(0, \alpha * (e^x - 1))$ +$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$ )DOC"); } }; -template class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { public: - Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu6 operator"); - AddOutput("Y", "Output of Relu6 operator"); - AddAttr("threshold", "The threshold value of Relu6") - .SetDefault(static_cast(6)); + AddOutput("Out", "Output of Relu6 operator"); + AddAttr("threshold", "The threshold value of Relu6") + .SetDefault(6.0f); AddComment(R"DOC( Relu6 Activation Operator. -$y = \min(\max(0, x), 6)$ +$out = \min(\max(0, x), 6)$ )DOC"); } }; -template class PowOpMaker : public framework::OpProtoAndCheckerMaker { public: - PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + PowOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Pow operator"); - AddOutput("Y", "Output of Pow operator"); - AddAttr("factor", "The exponential factor of Pow") - .SetDefault(static_cast(1)); + AddOutput("Out", "Output of Pow operator"); + AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); AddComment(R"DOC( Pow Activation Operator. -$y = x^{factor}$ +$out = x^{factor}$ )DOC"); } }; -template class STanhOpMaker : public framework::OpProtoAndCheckerMaker { public: - STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of STanh operator"); - AddOutput("Y", "Output of STanh operator"); - AddAttr("scale_a", "The scale parameter of a for the input") - .SetDefault(static_cast(2 / 3)); - AddAttr("scale_b", "The scale parameter of b for the input") - .SetDefault(static_cast(1.7159)); + AddOutput("Out", "Output of STanh operator"); + AddAttr("scale_a", "The scale parameter of a for the input") + .SetDefault(2.0f / 3.0f); + AddAttr("scale_b", "The scale parameter of b for the input") + .SetDefault(1.7159f); AddComment(R"DOC( STanh Activation Operator. -$$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ +$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ )DOC"); } }; -template class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - ThresholdedReluOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of ThresholdedRelu operator"); - AddOutput("Y", "Output of ThresholdedRelu operator"); - AddAttr("threshold", "The threshold location of activation") - .SetDefault(static_cast(1.0)); + AddOutput("Out", "Output of ThresholdedRelu operator"); + AddAttr("threshold", "The threshold location of activation") + .SetDefault(1.0f); AddComment(R"DOC( ThresholdedRelu Activation Operator. 
$$ -y = \begin{cases} +out = \begin{cases} x, \text{if } x > threshold \\ 0, \text{otherwise} \end{cases} @@ -448,25 +468,23 @@ $$ } }; -template class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: - HardSigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of HardSigmoid operator"); - AddOutput("Y", "Output of HardSigmoid operator"); - AddAttr("slope", "Slope for linear approximation of sigmoid") - .SetDefault(static_cast(0.2)); - AddAttr("offset", "Offset for linear approximation of sigmoid") - .SetDefault(static_cast(0.5)); + AddOutput("Out", "Output of HardSigmoid operator"); + AddAttr("slope", "Slope for linear approximation of sigmoid") + .SetDefault(0.2f); + AddAttr("offset", "Offset for linear approximation of sigmoid") + .SetDefault(0.5f); AddComment(R"DOC( HardSigmoid Activation Operator. Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. -$y = \max(0, \min(1, slope * x + shift))$ +$out = \max(0, \min(1, slope * x + shift))$ The slope should be positive. The offset can be either positive or negative. The default slope and shift are set according to the above reference. @@ -476,6 +494,22 @@ It is recommended to use the defaults for this activation. } }; +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Swish operator"); + AddOutput("Out", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. 
+ +$$out = \frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -499,7 +533,7 @@ REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, tanh_shrink_grad, ops::ActivationOpGrad); -REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, +REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, softshrink_grad, ops::ActivationOpGrad); REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, @@ -508,6 +542,15 @@ REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, ops::ActivationOpGrad); +REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, + ops::ActivationOpGrad); + +REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, + ops::ActivationOpGrad); + +REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, + ops::ActivationOpGrad); + REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, reciprocal_grad, ops::ActivationOpGrad); @@ -523,47 +566,50 @@ REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad, REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, ops::ActivationOpGrad); -REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, +REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, ops::ActivationOpGrad); -REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, +REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, leaky_relu_grad, ops::ActivationOpGrad); -REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, - soft_relu_grad, ops::ActivationOpGrad); +REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad, + ops::ActivationOpGrad); -REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad, +REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad, ops::ActivationOpGrad); -REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad, +REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad, ops::ActivationOpGrad); -REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, +REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, ops::ActivationOpGrad); -REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, +REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, ops::ActivationOpGrad); -REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, +REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, hard_shrink_grad, ops::ActivationOpGrad); -REGISTER_OP(thresholded_relu, ops::ActivationOp, - ops::ThresholdedReluOpMaker, thresholded_relu_grad, - ops::ActivationOpGrad); +REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, + thresholded_relu_grad, ops::ActivationOpGrad); -REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, +REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, hard_sigmoid_grad, ops::ActivationOpGrad); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); 
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 97737857ab25dfa92163b64a750fd7a7d9ea0ac3..b9ccdf639cf4a9ea80d530e550c16089e50c44e0 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -1,32 +1,33 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/activation_op.h" namespace ops = paddle::operators; -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_GPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL); +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index ceb4b4e40b67473f42e67e3f02f8e012e1b1eb50..88c3d1c597a853abdee7753a5110be4a1726e905 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -1,71 +1,79 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
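The activation_op.h hunk that follows rewrites ActivationKernel so that the device is a template parameter and the inputs are fetched through detail::Ref, which fails with a checked error instead of dereferencing a null pointer. Since the template parameter lists are stripped in this view, here is a hedged reconstruction of the new forward kernel:

template <typename DeviceContext, typename Functor>
class ActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
    // detail::Ref converts a possibly-null pointer into a checked reference.
    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
                          "Cannot get input tensor X, variable name = %s",
                          context.op().Input("X"));
    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
                            "Cannot get output tensor Out, variable name = %s",
                            context.op().Output("Out"));
    Out.mutable_data<T>(context.GetPlace());
    auto x = framework::EigenVector<T>::Flatten(X);
    auto out = framework::EigenVector<T>::Flatten(Out);
    // The Eigen device now comes from the templated DeviceContext rather
    // than from a Place template parameter.
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
    functor(*place, x, out);
  }
};

The rename of the output from Y to Out, visible throughout the functors below, follows the same convention.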
*/ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { -template +template class ActivationKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Y = context.Output("Y"); - Y->mutable_data(context.GetPlace()); - - auto x = framework::EigenVector::Flatten(*X); - auto y = framework::EigenVector::Flatten(*Y); - auto place = context.GetEigenDevice(); + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + Out.mutable_data(context.GetPlace()); + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y); + functor(*place, x, out); } }; -template +template class ActivationGradKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* dY = context.Input(framework::GradVarName("Y")); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); dX->mutable_data(context.GetPlace()); - auto dy = framework::EigenVector::Flatten(*dY); + auto dout = framework::EigenVector::Flatten(*dOut); auto x = framework::EigenVector::Flatten(*X); - auto y = framework::EigenVector::Flatten(*Y); + auto out = framework::EigenVector::Flatten(*Out); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y, dy, dx); + functor(*place, x, out, dout, dx); } }; @@ -81,17 +89,18 @@ struct BaseActivationFunctor { // sigmoid(x) = 1 / (1 + exp(-x)) template struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); } }; template struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * y * (static_cast(1) - y); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); } }; @@ -99,7 +108,7 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ // We can rewrite the above equation as: -// y = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] // = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) // = -log( 
exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - // max(-x, 0))) @@ -110,10 +119,10 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { // + exp(-x - max(-x, 0)))) template struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { + template + void operator()(Device d, X x, Out out) const { auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - y.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); } }; @@ -122,62 +131,66 @@ struct LogSigmoidFunctor : public BaseActivationFunctor { // exp(-x - max(-x, 0))) template struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) dx.device(d) = - dy * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } }; // exp(x) = e^x template struct ExpFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.exp(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.exp(); } }; template struct ExpGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * y; + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out; } }; // relu(x) = max(x, 0) template struct ReluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.cwiseMax(static_cast(0)); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); } }; template struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * (x > static_cast(0)).template cast(); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast(); } }; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.tanh(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); } }; template struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * (static_cast(1) - y * y); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); } }; @@ -185,17 +198,18 @@ struct TanhGradFunctor : public BaseActivationFunctor { // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template struct TanhShrinkFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x - x.tanh(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); } }; template struct TanhShrinkGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * (x.tanh() * x.tanh()); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) 
const { + dx.device(d) = dout * (x.tanh() * x.tanh()); } }; @@ -208,11 +222,11 @@ struct HardShrinkFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y) const { + template + void operator()(Device d, X x, Out out) const { auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); auto temp2 = (x > static_cast(threshold)).template cast().eval(); - y.device(d) = x * (temp1 + temp2); + out.device(d) = x * (temp1 + temp2); } }; @@ -224,11 +238,12 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); auto temp2 = (x > static_cast(threshold)).template cast().eval(); - dx.device(d) = dy * (temp1 + temp2).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); } }; @@ -241,12 +256,12 @@ struct SoftShrinkFunctor : public BaseActivationFunctor { return {{"lambda", &lambda}}; } - template - void operator()(Device d, X x, Y y) const { + template + void operator()(Device d, X x, Out out) const { auto lambdaT = static_cast(lambda); auto temp1 = (x > lambdaT).template cast().eval(); auto temp2 = (x < -lambdaT).template cast().eval(); - y.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); } }; @@ -256,98 +271,140 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"lambda", &lambda}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto lambdaT = static_cast(lambda); auto temp1 = (x > lambdaT).template cast().eval(); auto temp2 = (x < -lambdaT).template cast().eval(); - dx.device(d) = dy * (temp1 + temp2).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); } }; // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.sqrt(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); } }; template struct SqrtGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - const Y y_conj = Eigen::numext::conj(y); - dx.device(d) = static_cast(0.5) * dy / y_conj; + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + const Out out_conj = Eigen::numext::conj(out); + dx.device(d) = static_cast(0.5) * dout / out_conj; + } +}; + +// ceil(x) = smallest integer not less than x +template +struct CeilFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.ceil(); + } +}; + +template +struct ZeroGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0) / x; + } +}; + +// floor(x) = largest integer not greater than x +template +struct FloorFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.floor(); + } +}; + +// round(x) = nearest integer to x +template +struct RoundFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out
out) const { + out.device(d) = x.round(); } }; // abs(x) = |x| template struct AbsFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.abs(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.abs(); } }; template struct AbsGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * x.sign(); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.sign(); } }; // reciprocal(x) = 1 / x template struct ReciprocalFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = static_cast(1) / x; + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / x; } }; template struct ReciprocalGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * static_cast(-1) * y * y; + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(-1) * out * out; } }; // log(x) = natural logarithm of x template struct LogFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.log(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.log(); } }; template struct LogGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * (static_cast(1) / x); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) / x); } }; // square(x) = x^2 template struct SquareFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.square(); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); } }; template struct SquareGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * static_cast(2) * x; + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(2) * x; } }; @@ -362,9 +419,9 @@ struct BReluFunctor : public BaseActivationFunctor { return {{"t_min", &t_min}, {"t_max", &t_max}}; } - template - void operator()(Device d, X x, Y y) const { - y.device(d) = + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); } }; @@ -376,9 +433,10 @@ struct BReluGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"t_min", &t_min}, {"t_max", &t_max}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * ((x > static_cast(t_min)) * (x < static_cast(t_max))) .template cast(); } @@ -393,9 +451,9 @@ struct Relu6Functor : public BaseActivationFunctor { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y) const { - y.device(d) = + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)).cwiseMin(static_cast(threshold)); } }; @@ -406,9 +464,10 @@ struct Relu6GradFunctor : public BaseActivationFunctor { typename 
BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * ((x > static_cast(0)) * (x < static_cast(threshold))) .template cast(); } @@ -421,10 +480,10 @@ struct Relu6GradFunctor : public BaseActivationFunctor { // Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) template struct SoftplusFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) { + template + void operator()(Device d, X x, Out out) { auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) - y.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); } }; @@ -434,19 +493,21 @@ struct SoftplusFunctor : public BaseActivationFunctor { // exp(x - max(x, 0))) template struct SoftplusGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) - dx.device(d) = dy * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + dx.device(d) = + dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); } }; // softsign(x) = x / (1 + |x|) template struct SoftsignFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y) { - y.device(d) = x / (static_cast(1) + x.abs()); + template + void operator()(Device d, X x, Out out) { + out.device(d) = x / (static_cast(1) + x.abs()); } }; @@ -454,10 +515,11 @@ struct SoftsignFunctor : public BaseActivationFunctor { // Taken from https://en.wikipedia.org/wiki/Activation_function template struct SoftsignGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Y y, dY dy, dX dx) { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { dx.device(d) = - dy * (static_cast(1) / (static_cast(1) + x.abs()).square()); + dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } }; @@ -468,11 +530,11 @@ struct SoftReluFunctor : public BaseActivationFunctor { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y) const { + template + void operator()(Device d, X x, Out out) const { auto tmp = static_cast(threshold); auto temp = x.cwiseMax(-tmp).cwiseMin(tmp); - y.device(d) = (static_cast(1) + temp.exp()).log(); + out.device(d) = (static_cast(1) + temp.exp()).log(); } }; @@ -482,11 +544,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto tmp = static_cast(threshold); auto temp = ((x > -tmp) * (x < tmp)).template cast().eval(); - dx.device(d) = dy * (static_cast(1) - (-y).exp()) * temp; + dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } }; @@ -497,9 +560,9 @@ struct LeakyReluFunctor : public BaseActivationFunctor { return {{"alpha", &alpha}}; } - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.cwiseMax(static_cast(alpha) * x); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); } }; @@ -509,12 +572,13 @@ 
struct LeakyReluGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"alpha", &alpha}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = static_cast(alpha) * (x < static_cast(0)).template cast().eval(); auto temp2 = (x >= static_cast(0)).template cast().eval(); - dx.device(d) = dy * (temp1 + temp2).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); } }; @@ -525,11 +589,11 @@ struct ELUFunctor : public BaseActivationFunctor { return {{"alpha", &alpha}}; } - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.cwiseMax(static_cast(0)) + - (static_cast(alpha) * (x.exp() - static_cast(1))) - .cwiseMin(static_cast(0)); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)) + + (static_cast(alpha) * (x.exp() - static_cast(1))) + .cwiseMin(static_cast(0)); } }; @@ -539,10 +603,11 @@ struct ELUGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"alpha", &alpha}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * (x > static_cast(0)).template cast() + - dy * (y + static_cast(alpha)) * + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast() + + dout * (out + static_cast(alpha)) * (x < static_cast(0)).template cast(); } }; @@ -554,9 +619,9 @@ struct PowFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"factor", &factor}}; } - template - void operator()(Device d, X x, Y y) const { - y.device(d) = x.pow(static_cast(factor)); + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.pow(static_cast(factor)); } }; @@ -566,9 +631,10 @@ struct PowGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"factor", &factor}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = dy * static_cast(factor) * + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(factor) * x.pow(static_cast(factor - static_cast(1))); } }; @@ -581,9 +647,9 @@ struct STanhFunctor : public BaseActivationFunctor { return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; } - template - void operator()(Device d, X x, Y y) const { - y.device(d) = + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); } }; @@ -596,12 +662,13 @@ struct STanhGradFunctor : public BaseActivationFunctor { return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto a = static_cast(scale_a); auto b = static_cast(scale_b); auto temp = (a * x).tanh() * (a * x).tanh(); - dx.device(d) = dy * a * b * (static_cast(1) - temp); + dx.device(d) = dout * a * b * (static_cast(1) - temp); } }; @@ -612,10 +679,10 @@ struct ThresholdedReluFunctor : public BaseActivationFunctor { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y) const { + template + void operator()(Device d, X x, Out out) const { auto th = static_cast(threshold); - y.device(d) = (x > 
th).template cast() * x; + out.device(d) = (x > th).template cast() * x; } }; @@ -626,10 +693,11 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { return {{"threshold", &threshold}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto th = static_cast(threshold); - dx.device(d) = dy * (x > th).template cast(); + dx.device(d) = dout * (x > th).template cast(); } }; @@ -641,10 +709,11 @@ struct HardSigmoidFunctor : public BaseActivationFunctor { return {{"slope", &slope}, {"offset", &offset}}; } - template - void operator()(Device d, X x, Y y) const { + template + void operator()(Device d, X x, Out out) const { auto temp = x * static_cast(slope) + static_cast(offset); - y.device(d) = temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); } }; @@ -656,12 +725,43 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { return {{"slope", &slope}, {"offset", &offset}}; } - template - void operator()(Device d, X x, Y y, dY dy, dX dx) const { - dx.device(d) = - dy * - ((y > static_cast(0)) * (y < static_cast(1))).template cast() * - static_cast(slope); + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } +}; + +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * out)); + dx.device(d) = dout * ((beta * out) + temp2); } }; @@ -677,6 +777,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \ + __macro(ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \ __macro(square, SquareFunctor, SquareGradFunctor); \ @@ -692,4 +795,5 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { __macro(elu, ELUFunctor, ELUGradFunctor); \ __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ + __macro(swish, SwishFunctor, SwishGradFunctor); \ __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index b717e1647e4b89285b841420650dc69e8a1e0c58..d8a9491c8247ac463e01606dac248780d5284236 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -59,8 +59,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { class AdadeltaOpMaker : public 
framework::OpProtoAndCheckerMaker { public: - AdadeltaOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -92,12 +91,12 @@ for gradient descent. Adadelta updates are as follows: -$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break -paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) / - (avgSquaredGrad_out + \epsilon))}$ * grad \break -avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * - {(paramUpdate)}^2 \break -paramOut = param + paramUpdate$$ +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ )DOC"); } @@ -109,4 +108,5 @@ paramOut = param + paramUpdate$$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel); + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu index 3af1c8c8e9861138a33b3156818f704c3b20363f..91294a0d5d148a43bb95ab83ae8176b475fde9de 100644 --- a/paddle/operators/adadelta_op.cu +++ b/paddle/operators/adadelta_op.cu @@ -1,20 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/adadelta_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adadelta, ops::AdadeltaOpKernel); +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h index d29e15c43583bd447fbacb548a326f303f7d1463..819d0845dbdafab95d993a455013300fa71495e2 100644 --- a/paddle/operators/adadelta_op.h +++ b/paddle/operators/adadelta_op.h @@ -19,7 +19,7 @@ limitations under the License. 
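The rewritten DOC block above states the four Adadelta equations; the kernel in adadelta_op.h, which continues below, evaluates them with Eigen expressions over flattened tensors. As a worked reference, a scalar version of one update step (a hypothetical standalone helper, not part of this patch):

#include <cmath>

struct AdadeltaState {
  float avg_sq_grad = 0.f;    // running E[g^2]
  float avg_sq_update = 0.f;  // running E[dx^2]
};

// One Adadelta step for a single parameter, following the DOC formulas.
float AdadeltaStep(float param, float grad, AdadeltaState* s,
                   float rho = 0.95f, float epsilon = 1.0e-6f) {
  s->avg_sq_grad = rho * s->avg_sq_grad + (1 - rho) * grad * grad;
  float update = -std::sqrt((s->avg_sq_update + epsilon) /
                            (s->avg_sq_grad + epsilon)) *
                 grad;
  s->avg_sq_update = rho * s->avg_sq_update + (1 - rho) * update * update;
  return param + update;
}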
*/ namespace paddle { namespace operators { -template +template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -33,8 +33,8 @@ class AdadeltaOpKernel : public framework::OpKernel { avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - float rho = ctx.Attr("rho"); - float epsilon = ctx.Attr("epsilon"); + T rho = static_cast(ctx.Attr("rho")); + T epsilon = static_cast(ctx.Attr("epsilon")); auto param = framework::EigenVector::Flatten( *ctx.Input("Param")); @@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel { framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); auto avg_squared_update_out = framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); avg_squared_grad_out.device(place) = rho * avg_squared_grad + (1 - rho) * grad.square(); diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d6686e3ef3165976cf4c077a7a0f213082aa7716..c83318a272302a474c37ce86365201acf56b9cad 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -59,8 +59,7 @@ class AdagradOp : public framework::OperatorWithKernel { class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdagradOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -80,8 +79,8 @@ Adaptive Gradient Algorithm (Adagrad). The update is done as follows: -$$momentOut = moment + grad * grad \break -paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} $$ The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) @@ -100,54 +99,24 @@ size_t FindPos(const std::vector& rows, int64_t value) { } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. 
g_m.rows = set(g.rows) - auto grad_rows = grad.rows(); - std::set row_set(grad_rows.begin(), grad_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); - auto grad_width = grad.value().dims()[1]; - std::unique_ptr grad_merge{ - new framework::SelectedRows()}; - grad_merge->set_rows(merge_rows); - grad_merge->set_height(grad.height()); - grad_merge->mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), grad_width}), - context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, grad_merge->mutable_value(), 0.0); - - auto* grad_merge_data = grad_merge->mutable_value()->data(); - auto* grad_data = grad.value().data(); - - for (size_t i = 0; i < grad_rows.size(); i++) { - size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]); - for (int64_t j = 0; j < grad_width; j++) { - grad_merge_data[grad_merge_i * grad_width + j] += - grad_data[i * grad_width + j]; - } - } + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto& merge_rows = grad_merge.rows(); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); // 2. m += g_m * g_m - std::unique_ptr grad_square{ - new framework::SelectedRows()}; - grad_square->set_rows(grad_merge->rows()); - grad_square->set_height(grad_merge->height()); - grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), - context.GetPlace()); - auto gs = - framework::EigenVector::Flatten(*(grad_square->mutable_value())); - auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; - - math::SelectedRowsAddToTensor functor; - functor(context, *grad_square, moment); + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); // 3. update parameter auto* lr = learning_rate.data(); @@ -164,13 +133,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); REGISTER_OP_CPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 5b869e6bc5f4604ba6055ffd62fa21e4a1f41b93..4e579387924a5b0499f29609bc6b1322030a3c0d 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -1,21 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/adagrad_op.h" -#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/platform/cuda_helper.h" namespace paddle { @@ -72,80 +72,48 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) - auto grad_rows = grad.rows(); - std::set row_set(grad_rows.begin(), grad_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); - auto grad_width = grad.value().dims()[1]; - std::unique_ptr grad_merge{ - new framework::SelectedRows()}; - grad_merge->set_rows(merge_rows); - grad_merge->set_height(grad.height()); - grad_merge->mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), grad_width}), - context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, grad_merge->mutable_value(), 0.0); - - auto* grad_merge_data = grad_merge->mutable_value()->data(); - auto* grad_data = grad.value().data(); - - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid1(1, grad_rows.size()); - - MergeGradKernel< - T, 256><<(context) - .stream()>>>(grad_data, grad.rows().data(), - grad_merge_data, grad_merge->rows().data(), - grad_merge->rows().size(), grad_width); - + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + auto& merge_rows = grad_merge.rows(); // 2. m += g_m * g_m - std::unique_ptr grad_square{ - new framework::SelectedRows()}; - grad_square->set_rows(grad_merge->rows()); - grad_square->set_height(grad_merge->height()); - grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), - context.GetPlace()); - auto gs = - framework::EigenVector::Flatten(*(grad_square->mutable_value())); - auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; - - math::SelectedRowsAddToTensor functor; - functor(context, *grad_square, moment); + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); // 3. 
update parameter auto* lr = learning_rate.data(); auto* param_data = param->data(); auto* moment_data = moment->data(); + const int block_size = 256; + dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); SparseAdagradFunctorKernel< T, 256><<(context) - .stream()>>>(grad_merge_data, grad_merge->rows().data(), - lr, param_data, - moment_data, grad_width, epsilon); + .stream()>>>(grad_merge_data, grad_merge.rows().data(), + lr, param_data, moment_data, grad_width, + epsilon); } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 4d4a6434c7c472d8ceb01edfc4050fbb009d6c9f..66f5b0f449a4f11a3c734c98a6a97833763348a1 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,15 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param); }; -template +template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -47,17 +47,24 @@ class AdagradOpKernel : public framework::OpKernel { *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( *ctx.Input("Moment")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); + auto* learning_rate = ctx.Input("LearningRate"); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = moment + grad * grad; + moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = - param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* lr = learning_rate->data(); + param_out.device(*place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } else { + auto lr = framework::EigenVector::Flatten(*learning_rate); + param_out.device(*place) = + param - + lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); @@ -65,8 +72,9 @@ class AdagradOpKernel : public framework::OpKernel { auto* moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); - SparseAdagradFunctor functor; - functor(ctx.device_context(), *ctx.Input("Grad"), + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index 97a091ae766abfba5412bbd32c34a6f80701fbf7..03527de936bf736d572fb0140033bde4db990981 100644 --- 
a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -73,7 +73,7 @@ class AdamOp : public framework::OperatorWithKernel { class AdamOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -112,11 +112,13 @@ adaptive estimates of lower-order moments. Adam updates: -$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break -moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break -learningRate = learningRate * - $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break -paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$ +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ )DOC"); } @@ -126,5 +128,6 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); -REGISTER_OP_CPU_KERNEL(adam, - ops::AdamOpKernel); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu index a3def912e540454275350209435eb01ae2151331..94f840c188942a900858429bc621c3a18d5900ad 100644 --- a/paddle/operators/adam_op.cu +++ b/paddle/operators/adam_op.cu @@ -1,20 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/adam_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adam, - ops::AdamOpKernel); +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 45938006db1231a7a134964d729df6ca114d4dbe..9cc34bdded780e61e8700eb4fa4a295c84fb48bc 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -13,59 +13,210 @@ See the License for the specific language governing permissions and limitations under the License.
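adam_op.h, continuing below, drops the Eigen broadcast implementation in favor of element-wise functors driven by platform::ForRange, which lets the same code serve dense and sparse gradients. Per element, AdamFunctor performs the update stated in the DOC above; a scalar sketch (hypothetical helper, mirroring the functor's arithmetic):

#include <cmath>

// One bias-corrected Adam step for a single element, matching the body of
// AdamFunctor::operator() below.
void AdamStep(float g, float lr, float beta1, float beta2, float epsilon,
              float beta1_pow, float beta2_pow, float* mom1, float* mom2,
              float* param) {
  lr *= std::sqrt(1 - beta2_pow) / (1 - beta1_pow);  // bias correction
  *mom1 = beta1 * *mom1 + (1 - beta1) * g;
  *mom2 = beta2 * *mom2 + (1 - beta2) * g * g;
  *param -= lr * (*mom1 / (std::sqrt(*mom2) + epsilon));
}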
*/ #pragma once -#include "paddle/framework/eigen.h" +#include // for sqrt in CPU and CUDA #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/operators/math/selected_rows_functor.h" +#include "paddle/platform/for_range.h" namespace paddle { namespace operators { -template +namespace scatter = paddle::operators::math::scatter; + +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. + T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel) {} + + inline HOSTDEVICE void operator()(size_t i) const { + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + for (int64_t j = 0; j < row_numel_; ++j) { + T g = grad_[i * row_numel_ + j]; + T mom1 = moment1_[rows_[i] * row_numel_ + j]; + T mom2 = moment2_[rows_[i] * row_numel_ + j]; + T lr = *lr_; + T p = param_[rows_[i] * row_numel_ + j]; + + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + moment1_out_[rows_[i] * row_numel_ + j] = mom1; + moment2_out_[rows_[i] * row_numel_ + j] = mom2; + param_out_[rows_[i] * row_numel_ + j] = p; + } // for col id + } +}; + +template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment1_out_tensor = ctx.Output("Moment1Out"); - auto moment2_out_tensor = 
ctx.Output("Moment2Out"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment1_out_tensor->mutable_data(ctx.GetPlace()); - moment2_out_tensor->mutable_data(ctx.GetPlace()); - - float beta1 = ctx.Attr("beta1"); - float beta2 = ctx.Attr("beta2"); - float epsilon = ctx.Attr("epsilon"); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment1 = framework::EigenVector::Flatten( - *ctx.Input("Moment1")); - auto moment2 = framework::EigenVector::Flatten( - *ctx.Input("Moment2")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto beta2_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta2Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); - auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto place = ctx.GetEigenDevice(); - - moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square(); - - // All of these are tensors of 1 element - auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); - // Eigen does not support automatic broadcast - // Get dimensions of moment vector to broadcast lr_t - Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(place) = - param - - lr_t.broadcast(m_dsize) * - (moment1_out / (moment2_out.sqrt() + epsilon)); + using paddle::framework::LoDTensor; + using paddle::operators::detail::Ref; + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + auto& param = Ref(ctx.Input("Param"), "Must set Param"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); + auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); + auto& lr = + Ref(ctx.Input("LearningRate"), "Must set LearningRate"); + + auto& beta1_pow = + Ref(ctx.Input("Beta1Pow"), "Must set Beta1Pow"); + auto& beta2_pow = + Ref(ctx.Input("Beta2Pow"), "Must set Beta2Pow"); + + auto& param_out = + Ref(ctx.Output("ParamOut"), "Must set ParamOut"); + auto& mom1_out = + Ref(ctx.Output("Moment1Out"), "Must set Moment1Out"); + auto& mom2_out = + Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); + + if (grad_var->IsType()) { + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } else if (grad_var->IsType()) { + auto& grad = + Ref(ctx.Input("Grad"), "Must set Grad"); + // merge duplicated rows if any. 
+ scatter::MergeAdd merge_func; + auto grad_merge = + merge_func(ctx.template device_context(), grad); + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* rows = grad_merge.rows().data(); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); + platform::ForRange for_range( + static_cast(ctx.device_context()), + grad_merge.rows().size()); + for_range(functor); + } else { + PADDLE_THROW("Variable type not supported by adam_op"); + } } }; diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 14cf3841b33a8153549e4c99ed2b75286e9c64db..3b0b71418477ea128dbb31a8d7cd44cf6bf023a1 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -67,7 +67,7 @@ class AdamaxOp : public framework::OperatorWithKernel { class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -107,10 +107,12 @@ Adam algorithm based on the infinity norm. Adamax updates: -$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break -infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break -learningRate = learningRate /(1 - \beta_1_{pow}) \break -paramOut = param - learningRate * momentPut / infNormOut$$ +$$ +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} +$$ The original paper does not have an epsilon attribute. However, it is added here for numerical stability to prevent the @@ -125,5 +127,6 @@ division by 0 error. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL(adamax, - ops::AdamaxOpKernel); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu index fee3b6fc6b656917d79b84f48da8e63be7683890..8f87bb28671018a184f25a014f9bdb7615f3040c 100644 --- a/paddle/operators/adamax_op.cu +++ b/paddle/operators/adamax_op.cu @@ -1,20 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/adamax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adamax, - ops::AdamaxOpKernel); +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index 2c99832ec08e9c1d9b5458c467d5238f9b1b3c37..172c179c5fabf5ca106bf11479aff2d94a4e21d2 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -31,9 +31,9 @@ class AdamaxOpKernel : public framework::OpKernel { moment_out_tensor->mutable_data(ctx.GetPlace()); inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - float beta1 = ctx.Attr("beta1"); - float beta2 = ctx.Attr("beta2"); - float epsilon = ctx.Attr("epsilon"); + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); auto param = framework::EigenVector::Flatten( *ctx.Input("Param")); @@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel { auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); auto inf_norm_out = framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(place) = + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); auto lr_t = lr / (1 - beta1_pow); Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); } }; diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 233a81198e336d3190565fb18556f96979cec0ce..3fdad5ad9b1246e06bbb71e582a96c212de1b0d5 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -1,20 +1,21 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -27,16 +28,21 @@ class ArrayOp : public framework::OperatorBase { protected: size_t GetOffset(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const { + const platform::Place &place) const { auto *i = scope.FindVar(Input("I")); PADDLE_ENFORCE(i != nullptr, "I must be set"); auto &i_tensor = i->Get(); PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + size_t offset; if (platform::is_gpu_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU framework::Tensor t; - t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx); + framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t); dev_ctx.Wait(); offset = static_cast(*t.data()); } else { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76..ba5c6bd3c681b4ae4f612da96df866227961df3d 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -1,21 +1,23 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include + #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &rank_table = scope.FindVar(Input("RankTable"))->Get(); @@ -102,8 +104,14 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { if (len == 0) { continue; } - out->Slice(out_offset, out_offset + len) - .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx); + auto slice = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); out_offset += len; } } @@ -113,8 +121,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(std::vector) A vector of tensors that is going to " @@ -149,14 +156,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("lod_tensor_to_array"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 609e915b932e2bc4d5abee1e5f868cc07a7619d3..e04aa2d28cff7b106b30304bfa19ba18e2affd21 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -1,20 +1,21 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
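A pattern worth calling out, since it recurs through the rest of this diff: Run now receives a platform::Place instead of a ready-made context, and each operator fetches its context from the global DeviceContextPool. The sketch below uses only calls that appear in the hunks above; the header choice is an assumption, and the snippet builds only inside the Paddle tree.

#include "paddle/framework/op_registry.h"  // assumed to pull in Tensor and Copy
#include "paddle/platform/device_context.h"

// Copy a possibly-on-GPU tensor to CPU under the new Place-based API.
void CopyToCpu(const paddle::framework::Tensor& src,
               const paddle::platform::Place& place,
               paddle::framework::Tensor* dst) {
  namespace platform = paddle::platform;
  // The pool owns one context per device; Get(place) returns the match.
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& dev_ctx = *pool.Get(place);
  paddle::framework::Copy(src, platform::CPUPlace(), dev_ctx, dst);
  dev_ctx.Wait();  // the copy may be asynchronous on GPU; block before reading
}

This is why GetOffset in array_operator.h gains a place parameter: the context is no longer threaded through the call chain but looked up where it is needed.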
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/var_type.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -43,7 +44,8 @@ class AssignFunctor { out_rows.set_rows(rows.rows()); out_rows.set_height(rows.height()); auto &t = rows.value(); - out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_); + auto *m = out_rows.mutable_value(); + framework::Copy(t, t.place(), dev_ctx_, m); } template @@ -55,7 +57,7 @@ class AssignFunctor { void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { auto &out_tensor = *out; - out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_); + Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } @@ -70,7 +72,7 @@ class AssignOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); if (x == nullptr) { return; @@ -79,14 +81,17 @@ class AssignOp : public framework::OperatorBase { PADDLE_ENFORCE( out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } }; class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - AssignOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " @@ -108,8 +113,8 @@ class AssignInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *context) const override { if (context->HasInput("X")) { auto type = context->GetInputsVarType("X")[0]; - if (type == framework::VarDesc_VarType_SELECTED_ROWS || - type == framework::VarDesc_VarType_LOD_TENSOR) { + if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS || + type == framework::proto::VarDesc_VarType_LOD_TENSOR) { context->SetOutputDim("Out", context->GetInputDim("X")); } } @@ -121,12 +126,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); op->SetType("assign"); op->SetInput("X", OutputGrad("Out")); op->SetOutput("Out", InputGrad("X")); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/assign_value_op.cc b/paddle/operators/assign_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d5671c1183a0f58d2aedb0723bd462684ac5636e --- /dev/null +++ b/paddle/operators/assign_value_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/assign_value_op.h" + +namespace paddle { +namespace operators { + +class AssignValueOp : public framework::OperatorWithKernel { + public: + AssignValueOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AssignValueOp should not be null."); + auto shape = ctx->Attrs().Get<std::vector<int>>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::DataType(ctx.Attr<int>("dtype")), ctx.GetPlace()); + } +}; + +class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Output tensor of assign_value operator."); + AddAttr<std::vector<int>>("shape", + "(vector<int>) " + "Shape of values."); + AddAttr<int>("dtype", "data type of values") + .InEnum({framework::proto::DataType::INT32, + framework::proto::DataType::FP32}); + AddAttr<std::vector<float>>("fp32_values", "store the float values") + .SetDefault({}); + AddAttr<std::vector<int>>("int32_values", "store the int values") + .SetDefault({}); + AddComment(R"DOC( +AssignValue operator + +$$Out = values$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>, + ops::AssignValueKernel<float>); diff --git a/paddle/operators/assign_value_op.cu.cc b/paddle/operators/assign_value_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b17e20150053cea4c6b9ed6a5f222f77f4a4bd36 --- /dev/null +++ b/paddle/operators/assign_value_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/operators/assign_value_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/operators/assign_value_op.h b/paddle/operators/assign_value_op.h new file mode 100644 index 0000000000000000000000000000000000000000..db2e43077999fa0f9aaada74026dd701ab2bf464 --- /dev/null +++ b/paddle/operators/assign_value_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class AssignValueKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto shape = ctx.Attr>("shape"); + auto* out = ctx.Output("Out"); + int dtype = ctx.Attr("dtype"); + const char* value_name = nullptr; + switch (dtype) { + case framework::proto::DataType::INT32: + value_name = "int32_values"; + break; + case framework::proto::DataType::FP32: + value_name = "fp32_values"; + break; + default: + PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + break; + } + auto values = ctx.Attr>(value_name); + framework::CopyFromVector(values, ctx.device_context(), out); + out->Resize(framework::make_ddim(shape)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc index 6c3f67ec32fb1b942241997e87a1e9c4752e707d..b6494f95097bdc87081950815e910beda5d6850d 100644 --- a/paddle/operators/auc_op.cc +++ b/paddle/operators/auc_op.cc @@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), @@ -49,7 +49,7 @@ class AucOp : public framework::OperatorWithKernel { class AucOpMaker : public framework::OpProtoAndCheckerMaker { public: - AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + AucOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Out", "A floating point 2D tensor, values are in the range [0, 1]." 
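To make the control flow of AssignValueKernel concrete, here is a framework-free analogue: a sketch under the assumption that the dtype attribute merely selects which value list is copied into Out. The enum and struct below are stand-ins of mine, not the real framework::proto::DataType or attribute machinery, and the result is flattened to float for brevity where the real op keeps the chosen dtype.

#include <cstddef>
#include <stdexcept>
#include <vector>

enum class DType { INT32, FP32 };  // stand-in for framework::proto::DataType

struct AssignValueAttrs {  // mirrors the op's four attributes
  std::vector<int> shape;
  DType dtype;
  std::vector<float> fp32_values;
  std::vector<int> int32_values;
};

// dtype picks the source list; the flat result is then viewed with `shape`.
std::vector<float> RunAssignValue(const AssignValueAttrs& attrs) {
  std::vector<float> out;
  switch (attrs.dtype) {
    case DType::FP32:
      out.assign(attrs.fp32_values.begin(), attrs.fp32_values.end());
      break;
    case DType::INT32:
      out.assign(attrs.int32_values.begin(), attrs.int32_values.end());
      break;
    default:
      throw std::runtime_error("Unsupported dtype for assign_value_op");
  }
  std::size_t numel = 1;
  for (int d : attrs.shape) numel *= static_cast<std::size_t>(d);
  if (out.size() != numel)
    throw std::runtime_error("shape does not match the number of values");
  return out;
}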
diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h index e5ac57b038ac32ed35bce35e477ede0cdb5da813..b80509e2a99a2a255dff2a98d950257588a21d29 100644 --- a/paddle/operators/auc_op.h +++ b/paddle/operators/auc_op.h @@ -25,7 +25,7 @@ template using EigenVector = framework::EigenVector; -template +template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index f884e6efa917ce3f8554dce0e248f2b29273e3f3..0e984c38ba78bddc232ce43bd0982408e837abe3 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/batch_norm_op.h" +#include "paddle/framework/data_layout.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; template using EigenArrayMap = @@ -48,10 +50,6 @@ class BatchNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); - const float epsilon = ctx->Attrs().Get("epsilon"); - PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0"); - PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large"); - // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], "Mean and MeanOut should share the same memory"); @@ -60,14 +58,15 @@ class BatchNormOp : public framework::OperatorWithKernel { "Variance and VarianceOut should share the same memory"); const auto x_dims = ctx->GetInputDim("X"); - const TensorFormat tensor_format = - StringToTensorFormat(ctx->Attrs().Get("tensor_format")); - const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "Input X must have 3 to 5 dimensions."); + const int64_t C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); @@ -79,18 +78,23 @@ class BatchNormOp : public framework::OperatorWithKernel { ctx->SetOutputDim("VarianceOut", {C}); ctx->SetOutputDim("SavedMean", {C}); ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); } }; class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { public: - BatchNormOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("is_test", "").SetDefault(false); AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "").SetDefault(1e-5); - AddAttr("tensor_format", "").SetDefault("NCHW"); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); AddInput("X", "The input tensor"); AddInput("Scale", "Scale is a 1-dimensional tensor of size C " @@ -134,24 +138,25 @@ The required data format for this layer is one of the following: }; template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; auto *y = ctx.Output("Y"); @@ -176,8 +181,8 @@ class BatchNormKernel : public framework::OpKernel { saved_mean_e.setZero(); saved_variance_e.setZero(); - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); @@ -190,7 +195,7 @@ class BatchNormKernel : public framework::OpKernel { saved_variance_e /= N * sample_size; break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); for (int i = 0; i < N * sample_size; ++i) { saved_mean_e += x_arr.col(i); @@ -204,7 +209,7 @@ class BatchNormKernel : public framework::OpKernel { break; } default: - PADDLE_THROW("Unknown storage order: %s", tensor_format_str); + PADDLE_THROW("Unknown storage order: %s", data_layout_str); } EigenVectorArrayMap running_mean_arr( @@ -246,8 +251,8 @@ class BatchNormKernel : public framework::OpKernel { Eigen::Array new_bias = bias_arr - mean_arr * inv_std * scale_arr; - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -256,7 +261,7 @@ class BatchNormKernel : public framework::OpKernel { } break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, N * sample_size) = (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * @@ -266,7 +271,7 @@ class BatchNormKernel : public framework::OpKernel { break; } default: - PADDLE_THROW("Unknown storage order: %d", tensor_format); + PADDLE_THROW("Unknown storage order: %d", data_layout); } } }; @@ -289,11 +294,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); const auto x_dims = ctx->GetInputDim("X"); - const TensorFormat tensor_format = - StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); @@ -301,7 +306,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { @@ -317,12 +322,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.device_context()); + ctx.GetPlace()); } }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -332,19 +337,19 @@ class BatchNormGradKernel const auto *saved_mean = ctx.Input("SavedMean"); // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; ConstEigenVectorArrayMap scale_arr(scale->data(), C); @@ -375,8 +380,8 @@ class BatchNormGradKernel const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), @@ -399,7 +404,7 @@ class BatchNormGradKernel } break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, @@ -424,7 +429,7 @@ class BatchNormGradKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", tensor_format_str); + PADDLE_THROW("Unknown storage order: %s", data_layout_str); } } }; @@ -435,8 +440,9 @@ class BatchNormGradKernel namespace ops = paddle::operators; REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, batch_norm_grad, ops::BatchNormGradOp); -REGISTER_OP_CPU_KERNEL(batch_norm, - ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 726d1ea1b8d7ced93f94bb0e5bb4df9e43b0ac7b..3d17725ab47682355b2093782848849857f9bf59 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/batch_norm_op.h" +#include "paddle/framework/data_layout.h" #include #include "paddle/operators/math/math_function.h" @@ -22,44 +23,52 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; template using CudnnDataType = platform::CudnnDataType; -void ExtractNCWHD(const framework::DDim &dims, - const TensorFormat &tensor_format, int *N, int *C, int *H, - int *W, int *D) { +void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, + int *N, int *C, int *H, int *W, int *D) { *N = dims[0]; - *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; - *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) - : 1; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? 
dims[4] : dims[3]) + : 1; + } } template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); int N, C, H, W, D; - ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // ------------------- cudnn descriptors --------------------- cudnnTensorDescriptor_t data_desc_; @@ -85,7 +94,7 @@ class BatchNormKernel : public framework::OpKernel { VLOG(1) << "Setting descriptors."; std::vector dims; std::vector strides; - if (tensor_format == TensorFormat::NCHW) { + if (data_layout == DataLayout::kNCHW) { dims = {N, C, H, W, D}; strides = {C * H * W * D, H * W * D, W * D, D, 1}; } else { @@ -114,11 +123,12 @@ class BatchNormKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - math::SetConstant functor; - functor(ctx.device_context(), saved_mean, 0); - functor(ctx.device_context(), saved_variance, 0); + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. 
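Both paths implement the same normalization; as a reference for the CPU kernel above and the cuDNN calls below, the training-time computation per channel $c$, with $\gamma$ = Scale, $\beta$ = Bias, and $m = N \cdot H \cdot W \cdot D$ samples per channel, is (standard batch normalization, stated here as an assumption consistent with the code):

$$
\mu_c = \frac{1}{m} \sum_{i=1}^{m} x_{i,c}, \qquad
\sigma_c^2 = \frac{1}{m} \sum_{i=1}^{m} x_{i,c}^2 - \mu_c^2, \qquad
y_{i,c} = \gamma_c \cdot \frac{x_{i,c} - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c
$$

The saved mean and inverse standard deviation are cached for the backward pass, the running statistics are blended in with momentum, and at test time the running statistics stand in for the batch ones.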
if (is_test) { @@ -164,26 +174,26 @@ class BatchNormKernel : public framework::OpKernel { }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto *d_y = ctx.Input(framework::GradVarName("Y")); const auto *scale = ctx.Input("Scale"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); int N, C, H, W, D; - ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -210,7 +220,7 @@ class BatchNormGradKernel std::vector dims; std::vector strides; - if (tensor_format == TensorFormat::NCHW) { + if (data_layout == DataLayout::kNCHW) { dims = {N, C, H, W, D}; strides = {C * H * W * D, H * W * D, W * D, D, 1}; } else { @@ -237,11 +247,12 @@ class BatchNormGradKernel const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); + auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - ctx.cuda_device_context().cudnn_handle(), mode_, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x->template data(), data_desc_, d_y->template data(), data_desc_, + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, scale->template data(), d_scale->template mutable_data(ctx.GetPlace()), @@ -259,8 +270,9 @@ class BatchNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(batch_norm, - ops::BatchNormKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 4e80134a1acf3b4d66154453dd0ed709133d1c7c..a817ef41fc87da33ad87923c99a75ee7c3c7bbfe 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -19,28 +19,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum TensorFormat { - NHWC = 0, - NCHW = 1, -}; - -inline TensorFormat StringToTensorFormat(const std::string& str) { - if (str == "NHWC" || str == "nhwc") { - return TensorFormat::NHWC; - } else if (str == "NCHW" || str == "nchw") { - return TensorFormat::NCHW; - } else { - PADDLE_THROW("Unknown storage order string: %s", str); - } -} - -template +template class BatchNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; }; -template +template class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index 3904a97d58166cfeeb2be7d2144700dbd8bc5721..72e05607b0b612807d552b4c45b58f9d9ce9c2af 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -13,10 +13,41 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/beam_search_decode_op.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { +struct BeamSearchDecodeFunctor { + BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, LoDTensor* score_tensor) + : step_ids_(step_ids), + step_scores_(step_scores), + id_tensor_(id_tensor), + score_tensor_(score_tensor) {} + + template + void operator()() const; + + const LoDTensorArray& step_ids_; + const LoDTensorArray& step_scores_; + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; +}; + +template +void BeamSearchDecodeFunctor::operator()() const { + BeamSearchDecoder beam_search_decoder; + beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, + score_tensor_); +} + +template <> +void BeamSearchDecodeFunctor::operator()() const { + PADDLE_THROW("beam search decode op does not support bool!"); +} + class BeamSearchDecodeOp : public framework::OperatorBase { public: BeamSearchDecodeOp(const std::string& type, @@ -25,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase { const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + framework::ExecutionContext ctx(*this, scope, dev_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); @@ -45,17 +79,16 @@ class BeamSearchDecodeOp : public framework::OperatorBase { LoDTensor* sentenceIds = ctx.Output("SentenceIds"); LoDTensor* sentenceScores = ctx.Output("SentenceScores"); - BeamSearchDecoder beam_search_decoder; - beam_search_decoder.PackAllSteps(*ids, *scores, sentenceIds, - sentenceScores); + framework::VisitDataType( + framework::ToDataType(scores->at(0).type()), + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); } }; class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - BeamSearchDecodeOpProtoMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Ids", 
"(LodTensorArray)" "score of the candidate words in each step"); @@ -90,13 +123,13 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR); + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR); + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); } } }; diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h index 0f007ec22f9a66572971516a711317f348e1ec5a..3b1c6cd7a1045bfbb896725c79dc1ae2e22f43dc 100644 --- a/paddle/operators/beam_search_decode_op.h +++ b/paddle/operators/beam_search_decode_op.h @@ -232,12 +232,12 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( id_tensor->set_lod(lod); id_tensor->Resize({static_cast(id_data.size())}); id_tensor->mutable_data(paddle::platform::CPUPlace()); - id_tensor->CopyFromVector(id_data, cpu_ctx); + framework::CopyFromVector(id_data, cpu_ctx, id_tensor); score_tensor->set_lod(lod); score_tensor->Resize({static_cast(score_data.size())}); score_tensor->mutable_data(paddle::platform::CPUPlace()); - score_tensor->CopyFromVector(score_data, cpu_ctx); + framework::CopyFromVector(score_data, cpu_ctx, score_tensor); } template diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc index 17926a813d5b0b8ace6a1b20066cd0007703c696..ed2e7b738acd81140467ff22ad077155bba2fde1 100644 --- a/paddle/operators/beam_search_op.cc +++ b/paddle/operators/beam_search_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/beam_search_op.h" @@ -39,7 +39,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, std::map> hash; framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); + auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); auto *scores_data = selected_scores->mutable_data(platform::CPUPlace()); @@ -66,7 +66,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, void BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); + auto *pre_ids_data = pre_ids.data(); for (size_t offset = 0; offset < items->size(); offset++) { auto prefix_id = pre_ids_data[offset]; @@ -127,7 +127,7 @@ bool BeamSearch::NextItemSet(std::vector *items) { auto abs_lod = framework::ToAbsOffset(ids.lod()); PADDLE_ENFORCE_GE(source_abs_two_level_lod.size(), 2UL); - auto *ids_data = ids.data(); + auto *ids_data = ids.data(); auto *scores_data = scores.data(); size_t instance_dim = 1; @@ -139,7 +139,7 @@ bool BeamSearch::NextItemSet(std::vector *items) { items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (int d = 0; d < instance_dim; d++) { + for (size_t d = 0; d < instance_dim; d++) { const size_t dim_offset = offset * instance_dim + d; items->emplace_back(offset, ids_data[dim_offset], scores_data[dim_offset]); @@ -153,8 +153,7 @@ bool BeamSearch::NextItemSet(std::vector *items) { class BeamSearchProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { public: - BeamSearchProtoAndCheckerMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { // inputs and outputs stored in proto AddInput("pre_ids", "ids in previous step"); diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h index cc556bfe42ab12d73c0eb503d033efc272b5dd68..08b551ef9bd63106ed222d3a956a912294f827ec 100644 --- a/paddle/operators/beam_search_op.h +++ b/paddle/operators/beam_search_op.h @@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase { } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& dev_place) const override { LOG(INFO) << "run beam search op"; auto ids_var = scope.FindVar(Input("ids")); auto scores_var = scope.FindVar(Input("scores")); diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc index c65ba7eb262f3aabe2c00837b79806c0b40b60fd..7640147a12d66a924f16eaf168227b6ce6a96040 100644 --- a/paddle/operators/bilinear_tensor_product_op.cc +++ b/paddle/operators/bilinear_tensor_product_op.cc @@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel { class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { public: - BilinearTensorProductOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of bilinear_tensor_product operator."); AddInput("Y", "The second input of bilinear_tensor_product operator."); @@ -77,11 +76,19 @@ class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output 
of bilinear_tensor_product operator."); AddComment(R"DOC( Bilinear Tensor Product operator. -Given input X and Y, a 3D tensor weight, and bias. Each column of the -output is computed by one slice i = 1, . . . , k of the tensor: +Given input X and Y, a 3D tensor Weight and a Bias. Each column of the +Output is computed by one slice $i = 1, . . . , k$ of the tensor: - M = (X W_i) \cdot Y - Out_i = \sum_i {M_i} + Bias_i +$$ +M = (X W_i) * Y \\ +Out_i = \sum_j {M_j} + Bias_i +$$ + +Where $W_i$ is the $i$-th slice of Input(Weight); + $M_j$ is the $j$-th column of $M$; + $Out_i$ is the $i$-th column of Output(Out); + $Bias_i$ is a column vector, each element of it is equal to + the $i$-th element of $Bias$; )DOC"); } @@ -151,9 +158,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu index 858d2668d01379afe8082cd1eda32a2a5d09bd18..0f48010716f086a64c0b6a35b76e06a42430ab84 100644 --- a/paddle/operators/bilinear_tensor_product_op.cu +++ b/paddle/operators/bilinear_tensor_product_op.cu @@ -16,11 +16,15 @@ limitations under the License. */ #include "paddle/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_GPU_KERNEL( + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index 1113a4c6f357edb4f6b14b73c6eec9c6cca24ce5..ba9a2c5ce3c024a82e864a399ad90281d8dcdb20 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class BilinearTensorProductKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel { int out_dim = weight_dims[0]; auto x_dim = weight_dims[1]; auto y_dim = weight_dims[2]; - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: @@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto output_col_vec = output_mat.chip(i, 1); Tensor weight_mat = weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); + math::gemm(dev_ctx, 
CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); output_col_vec.device(place) = (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); } @@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { } }; -template +template class BilinearTensorProductGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { auto x_mat = EigenMatrix::From(*x); auto y_mat = EigenMatrix::From(*y); auto d_out_mat = EigenMatrix::From(*d_out); - auto place = ctx.GetEigenDevice(); - + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the Output(Y@Grad). Tensor x_scale; x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), @@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { ctx.GetPlace()); auto y_scale_mat = EigenMatrix::From(y_scale); - math::SetConstant set_zero; + math::SetConstant set_zero; // Set Output(X@Grad) be zero. if (d_x) { d_x->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_x, static_cast(0)); + set_zero(dev_ctx, d_x, static_cast(0)); } // Set Output(Y@Grad) be zero. if (d_y) { d_y->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_y, static_cast(0)); + set_zero(dev_ctx, d_y, static_cast(0)); } // Caculate the Output(X@Grad) and Output(Y@Grad). @@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_x) * y_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasTrans, - batch_size, x_dim, y_dim, 1, y_scale.data(), - weight_i.data(), 1, d_x->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); } if (d_y) { x_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_y) * x_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x_scale.data(), - weight_i.data(), 1, d_y->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); } } } @@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_weight) * x_mat; - math::gemm(ctx.device_context(), CblasTrans, CblasNoTrans, - x_dim, y_dim, batch_size, 1, x_scale.data(), - y->data(), 0, d_weight_i.data()); + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); } } diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 70ee7861bab3a982eae60dd85b10c2e41f5827d0..446976edafca56f3c56fe573c8b5ef76a333089f 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
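The gemm-plus-reduce in the forward kernel can be hard to map back onto the DOC formula, so here is one output element computed with plain loops: a sketch assuming row-major layouts, not the Paddle code.

// Out[b][i] = sum_j ( (X . W_i)[b][j] * Y[b][j] ) + Bias[i]
// X: [batch, x_dim], Y: [batch, y_dim], W_i: [x_dim, y_dim], all row-major.
float BilinearOutputElement(const float* x, const float* y, const float* w_i,
                            int b, int x_dim, int y_dim) {
  float out = 0.f;
  for (int j = 0; j < y_dim; ++j) {
    float m_j = 0.f;  // M_j = (X W_i)[b][j], the "left_mul" intermediate
    for (int k = 0; k < x_dim; ++k) {
      m_j += x[b * x_dim + k] * w_i[k * y_dim + j];
    }
    out += m_j * y[b * y_dim + j];
  }
  return out;  // add Bias[i] afterwards when a Bias input is provided
}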
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/cast_op.h" #include "paddle/framework/op_registry.h" @@ -20,13 +20,12 @@ namespace operators { class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CastOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of cast op"); AddOutput("Out", "The output tensor of cast op"); - AddAttr("out_data_type", "output data type"); - AddAttr("in_data_type", "input data type"); + AddAttr("out_dtype", "output data type"); + AddAttr("in_dtype", "input data type"); AddComment(R"DOC( Cast Operator. @@ -53,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); grad->SetType("cast"); grad->SetInput("X", OutputGrad("Out")); grad->SetOutput("Out", InputGrad("X")); - grad->SetAttr("out_data_type", GetAttr("in_data_type")); - grad->SetAttr("in_data_type", GetAttr("out_data_type")); - return std::unique_ptr(grad); + grad->SetAttr("out_dtype", GetAttr("in_dtype")); + grad->SetAttr("in_dtype", GetAttr("out_dtype")); + return std::unique_ptr(grad); } }; @@ -68,10 +67,11 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUPlace; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel); + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index fb75ddbabfefd8d00420d8c96f958abcb8fdce62..d68bbe6e39a2fbaa92787731145ae324288b981a 100644 --- a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -1,22 +1,23 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
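CastOpGradMaker above encodes a tidy identity: casting is element-wise, so the gradient of a cast is simply a cast in the opposite direction, and the generated grad op is another "cast" with in_dtype and out_dtype swapped. A standalone sketch of the idea (plain C++, nothing Paddle-specific):

#include <vector>

// Element-wise static_cast over a buffer: the essence of the cast op.
template <typename OutT, typename InT>
std::vector<OutT> Cast(const std::vector<InT>& in) {
  return std::vector<OutT>(in.begin(), in.end());
}

// forward:  y  = Cast<int>(x);    // float -> int
// backward: dx = Cast<float>(dy); // int -> float, dtypes swapped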
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/cast_op.h" template using CastOpKernel = - paddle::operators::CastOpKernel; + paddle::operators::CastOpKernel; -REGISTER_OP_GPU_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h index ffdbff7030afedab2efc06479ac86ad70c185f48..9f39d91edd49d236d74019ca81b42002e4f35d36 100644 --- a/paddle/operators/cast_op.h +++ b/paddle/operators/cast_op.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
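// Aside on CastOpGradMaker above: it encodes the identity that the
// "gradient" of a cast is simply another cast with in_dtype and
// out_dtype swapped. A standalone sketch of the element-wise pattern
// (standard library only; the helper name is an assumption):
#include <algorithm>
#include <vector>
template <typename InT, typename OutT>
std::vector<OutT> CastSketch(const std::vector<InT>& in) {
  std::vector<OutT> out(in.size());
  std::transform(in.begin(), in.end(), out.begin(),
                 [](InT v) { return static_cast<OutT>(v); });
  return out;
}
// CastSketch<float, int> mirrors the forward op; the generated grad op
// performs the reverse, CastSketch<int, float>.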
*/ #pragma once @@ -27,13 +27,13 @@ struct CastOpTransformFunctor { HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } }; -template +template struct CastOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::DeviceContext& ctx) + const DeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -42,21 +42,22 @@ struct CastOpFunctor { auto numel = in_->numel(); auto* in_end = in_begin + numel; auto* out_begin = out_->mutable_data(ctx_.GetPlace()); - platform::Transform trans; + platform::Transform trans; trans(ctx_, in_begin, in_end, out_begin, CastOpTransformFunctor()); } }; -template +template class CastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* out = context.Output("Out"); framework::VisitDataType( - static_cast(context.Attr("out_data_type")), - CastOpFunctor(in, out, context.device_context())); + static_cast(context.Attr("out_dtype")), + CastOpFunctor( + in, out, context.template device_context())); } }; diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index 309660b01fe7052de2f9300acdf00779d0228221..44f667aead9ac88fee57310e06e3192732a8d908 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -32,6 +32,13 @@ class ChunkEvalOp : public framework::OperatorWithKernel { "Output(Recall) of ChunkEvalOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("F1-Score"), "Output(F1-Score) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"), + "Output(NumInferChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"), + "Output(NumLabelChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NumCorrectChunks"), + "Output(NumCorrectChunks) of ChunkEvalOp should not be null."); auto inference_dim = ctx->GetInputDim("Inference"); auto label_dim = ctx->GetInputDim("Label"); @@ -42,25 +49,28 @@ class ChunkEvalOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Precision", {1}); ctx->SetOutputDim("Recall", {1}); ctx->SetOutputDim("F1-Score", {1}); + ctx->SetOutputDim("NumInferChunks", {1}); + ctx->SetOutputDim("NumLabelChunks", {1}); + ctx->SetOutputDim("NumCorrectChunks", {1}); } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(framework::DataType::FP32, - ctx.device_context()); + return framework::OpKernelType(framework::proto::DataType::FP32, + platform::CPUPlace()); } }; class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { public: - ChunkEvalOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Inference", - "(Tensor, default: Tensor). Predictions from the network."); + "(Tensor, default: Tensor). " + "Predictions from the network."); AddInput("Label", - "(Tensor, default: Tensor). The true tag sequences."); + "(Tensor, default: Tensor). The true tag sequences."); AddOutput("Precision", "(float). 
The evaluated precision (called positive predictive " "value) of chunks on the given mini-batch."); @@ -69,6 +79,16 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { "sensitivity) of chunks on the given mini-batch."); AddOutput("F1-Score", "(float). The evaluated F1-Score on the given mini-batch."); + AddOutput("NumInferChunks", + "(int64_t). The number of chunks in Inference on the given " + "mini-batch."); + AddOutput( + "NumLabelChunks", + "(int64_t). The number of chunks in Label on the given mini-batch."); + AddOutput( + "NumCorrectChunks", + "(int64_t). The number of chunks both in Inference and Label on the " + "given mini-batch."); AddAttr("num_chunk_types", "(int). The number of chunk types. See below for details."); AddAttr( @@ -84,7 +104,7 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(std::vector{}); AddComment(R"DOC( For some basics of chunking, please refer to -‘Chunking with Support Vector Mechines ’. +‘Chunking with Support Vector Machines ’. ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, @@ -97,7 +117,7 @@ Here is a NER example of labeling for these tagging schemes: IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC -There are three chunk types(named entity types) including PER(person), ORG(orgnazation) +There are three chunk types(named entity types) including PER(person), ORG(organization) and LOC(LOCATION), and we can see that the labels have the form -. Since the calculations actually use label ids rather than labels, extra attention diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index 81aa07817b673b2ff85a35a51cc43742b7ad7fed..300aff90c0af666a95b7d4a0329de709e48ceddb 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class ChunkEvalKernel : public framework::OpKernel { public: struct Segment { @@ -35,10 +35,10 @@ class ChunkEvalKernel : public framework::OpKernel { } }; - void GetSegments(const int* label, int length, std::vector& segments, - int num_chunk_types, int num_tag_types, int other_chunk_type, - int tag_begin, int tag_inside, int tag_end, - int tag_single) const { + void GetSegments(const int64_t* label, int length, + std::vector& segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { segments.clear(); segments.reserve(length); int chunk_start = 0; @@ -111,9 +111,7 @@ class ChunkEvalKernel : public framework::OpKernel { std::vector label_segments; std::vector output_segments; std::set excluded_chunk_types; - int64_t num_output_segments = 0; - int64_t num_label_segments = 0; - int64_t num_correct = 0; + if (context.Attr("chunk_scheme") == "IOB") { num_tag_types = 2; tag_begin = 0; @@ -147,16 +145,29 @@ class ChunkEvalKernel : public framework::OpKernel { context.Attr>("excluded_chunk_types").end()); auto* inference = context.Input("Inference"); + auto place = inference->place(); auto* label = context.Input("Label"); auto* precision = context.Output("Precision"); auto* recall = context.Output("Recall"); auto* f1 = context.Output("F1-Score"); - - const int* inference_data = inference->data(); - const int* label_data = label->data(); - T* precision_data = precision->mutable_data(context.GetPlace()); - T* racall_data =
recall->mutable_data(context.GetPlace()); - T* f1_data = f1->mutable_data(context.GetPlace()); + auto* num_infer_chunks = context.Output("NumInferChunks"); + auto* num_label_chunks = context.Output("NumLabelChunks"); + auto* num_correct_chunks = context.Output("NumCorrectChunks"); + + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); + int64_t* num_infer_chunks_data = + num_infer_chunks->mutable_data(place); + int64_t* num_label_chunks_data = + num_label_chunks->mutable_data(place); + int64_t* num_correct_chunks_data = + num_correct_chunks->mutable_data(place); + *num_infer_chunks_data = 0; + *num_label_chunks_data = 0; + *num_correct_chunks_data = 0; auto lod = label->lod(); PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); @@ -166,20 +177,26 @@ class ChunkEvalKernel : public framework::OpKernel { for (int i = 0; i < num_sequences; ++i) { int seq_length = lod[0][i + 1] - lod[0][i]; EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length, - output_segments, label_segments, num_output_segments, - num_label_segments, num_correct, num_chunk_types, - num_tag_types, other_chunk_type, tag_begin, tag_inside, - tag_end, tag_single, excluded_chunk_types); + output_segments, label_segments, *num_infer_chunks_data, + *num_label_chunks_data, *num_correct_chunks_data, + num_chunk_types, num_tag_types, other_chunk_type, tag_begin, + tag_inside, tag_end, tag_single, excluded_chunk_types); } - *precision_data = !num_output_segments ? 0 : static_cast(num_correct) / - num_output_segments; - *racall_data = !num_label_segments ? 0 : static_cast(num_correct) / - num_label_segments; - *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) / - ((*precision_data) + (*racall_data)); + *precision_data = !(*num_infer_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_infer_chunks_data); + *racall_data = !(*num_label_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_label_chunks_data); + *f1_data = !(*num_correct_chunks_data) + ? 0 + : 2 * (*precision_data) * (*racall_data) / + ((*precision_data) + (*racall_data)); } - void EvalOneSeq(const int* output, const int* label, int length, + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, std::vector& output_segments, std::vector& label_segments, int64_t& num_output_segments, int64_t& num_label_segments, diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index d9fc532e39500fa397be80396b075e866bad9362..b90921d79baa920f0b6f92cde2f7e1ca9183d0d2 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
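// Aside: the metric arithmetic the chunk_eval kernel above fills into
// Precision/Recall/F1-Score, as standalone helpers with the same zero
// guards (helper names are illustrative assumptions):
#include <cstdint>
double ChunkPrecision(int64_t num_correct, int64_t num_infer) {
  return num_infer ? static_cast<double>(num_correct) / num_infer : 0.0;
}
double ChunkRecall(int64_t num_correct, int64_t num_label) {
  return num_label ? static_cast<double>(num_correct) / num_label : 0.0;
}
double ChunkF1(double precision, double recall) {
  // the kernel guards on num_correct, so precision + recall > 0 here
  return 2.0 * precision * recall / (precision + recall);
}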
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/clip_by_norm_op.h" @@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel { class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { public: - ClipByNormOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of clip_by_norm op." @@ -47,15 +46,19 @@ class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The output of clip_by_norm op with shape as input(X)"); AddAttr("max_norm", "(float) The maximum norm value."); AddComment(R"DOC( -ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'. -If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be -the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will -be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as -shown in the following formula: +ClipByNorm Operator. -'Out' = 'max_norm' * 'X' / norm('X'), +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: -where norm('X') represents the L2 norm of 'X'. +$$ +Out = \frac{max\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. )DOC"); } }; @@ -67,4 +70,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); REGISTER_OP_CPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu index 2593a24ebbf56ecd286a726e527d2414247576e8..cbf8fa44133739f948fed13e18fc5cbaabd3abb0 100644 --- a/paddle/operators/clip_by_norm_op.cu +++ b/paddle/operators/clip_by_norm_op.cu @@ -1,19 +1,20 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h index b26476cae9b5b2fa290bc9186b9a64c48ba703d6..87956a707cf58afa2336602b8ab6acf73b0ff814 100644 --- a/paddle/operators/clip_by_norm_op.h +++ b/paddle/operators/clip_by_norm_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel { auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto temp = (x_norm <= max_norm).template cast().eval(); auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 3e9066ceb2a4a4dc19fdf5ef02bb7fadaab4bfff..573bb9c7dfdac2366c2458dd9f27a035a9f9b813 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
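// Aside: ClipByNormKernel above computes its scale factor branchlessly;
// the same arithmetic in scalar form (illustrative sketch):
float ClipByNormScale(float x_norm, float max_norm) {
  float keep = (x_norm <= max_norm) ? 1.0f : 0.0f;  // the "temp" mask
  return keep + (1.0f - keep) * max_norm / x_norm;  // 1, or max_norm/||X||
}
// Every element of Out is then X * ClipByNormScale(norm(X), max_norm),
// matching the formula in the ClipByNorm doc comment.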
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/clip_op.h" @@ -38,7 +38,7 @@ class ClipOp : public framework::OperatorWithKernel { template class ClipOpMaker : public framework::OpProtoAndCheckerMaker { public: - ClipOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor)The input of clip op." @@ -52,7 +52,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker { Clip Operator. The clip operator limits the value of given input within an interval. The interval is -specified with arguments 'min' and 'max'. +specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ )DOC"); } @@ -79,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, ops::ClipOpGrad); -REGISTER_OP_CPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_CPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu index ca9701298fdae3fabe234925edaf9e4d775cc66e..5ccbc9643407c65e8734711744ceac9814f4c6a2 100644 --- a/paddle/operators/clip_op.cu +++ b/paddle/operators/clip_op.cu @@ -1,21 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
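// Aside: the clip kernels below apply this per-element rule through
// platform::Transform; as a standalone sketch (helper name assumed):
template <typename T>
T ClipValue(T x, T min_v, T max_v) {
  return x < min_v ? min_v : (x > max_v ? max_v : x);  // min(max(x, min), max)
}
// The grad functor passes d_out through only where min < x < max and
// zeroes the gradient elsewhere.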
*/ #include "paddle/operators/clip_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_GPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h index ac702e9935201ba5263a80ebeb1ab22fa0bd1340..51db185dffd80cc3b839d063acaf3f936d732817 100644 --- a/paddle/operators/clip_op.h +++ b/paddle/operators/clip_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -55,7 +55,7 @@ class ClipGradFunctor { T max_; }; -template +template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); const T* x_data = x->data(); int64_t numel = x->numel(); - Transform trans; - trans(context.device_context(), x_data, x_data + numel, out_data, - ClipFunctor(min, max)); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); } }; -template +template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -86,9 +86,9 @@ class ClipGradKernel : public framework::OpKernel { auto* d_x_data = d_x->mutable_data(context.GetPlace()); const T* d_out_data = d_out->data(); const T* x_data = x->data(); - Transform trans; - trans(context.device_context(), d_out_data, d_out_data + numel, x_data, - d_x_data, ClipGradFunctor(min, max)); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); } } }; diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index bf7e88368157d29e627c3c06384f28b6e5e4ecc1..daa2c193b48fe216ff284169a3dce1b4cd40a791 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/compare_op.h" #include "paddle/framework/op_registry.h" @@ -20,8 +20,7 @@ namespace operators { template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CompareOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { OpComment comment; AddInput("X", @@ -67,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu index 6ac8c124b9b2e7c808808ecc8802a2e5aeaa5b5d..26049271befd1fe57001659d1a406e73de0004a7 100644 --- a/paddle/operators/compare_op.cu +++ b/paddle/operators/compare_op.cu @@ -1,23 +1,23 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
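// Aside: the element-wise comparison the compare kernels perform, as a
// standalone sketch (standard library only; helper name assumed):
#include <algorithm>
#include <functional>
#include <vector>
std::vector<bool> LessThanSketch(const std::vector<int>& x,
                                 const std::vector<int>& y) {
  std::vector<bool> out(x.size());
  std::transform(x.begin(), x.end(), y.begin(), out.begin(),
                 std::less<int>());  // mirrors LessThanFunctor
  return out;
}
// Note also CompareOp::GetExpectedKernelType above: the kernel's place
// is copied from input X, so the comparison runs where its data lives.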
*/ #include "paddle/operators/compare_op.h" -REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor); -REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor); -REGISTER_LOGICAL_KERNEL(greater_than, GPU, +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor); -REGISTER_LOGICAL_KERNEL(greater_equal, GPU, +REGISTER_LOGICAL_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor); -REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index afdf3ab3e098b4e7f4c996471617d97ec49264b1..567e89c0a727ad0cdd2add8ec8b2a42c86a58007 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
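// Aside: the REGISTER_LOGICAL_KERNEL macro below relies on preprocessor
// token pasting. After this refactor, dev##DeviceContext expands to
// CPUDeviceContext or CUDADeviceContext depending on the dev argument
// (previously dev##Place expanded to CPUPlace or GPUPlace). Sketch:
#define PASTE_DEVICE_CTX(dev) dev##DeviceContext
// PASTE_DEVICE_CTX(CPU)  -> CPUDeviceContext
// PASTE_DEVICE_CTX(CUDA) -> CUDADeviceContext
// which is why the .cu registrations above switch from GPU to CUDA.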
*/ #pragma once #include @@ -59,7 +59,7 @@ struct EqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -69,24 +69,23 @@ auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>); +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 5f052689251bc023df635d41c1e64a660a0aa488..32b61edfd0dd163e5ef8f3d1de133c55314458b5 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, - "Inputs(X) of ConcatOp should be empty.") + "Inputs(X) of ConcatOp should not be empty."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ConcatOp should not be null."); @@ -41,20 +41,24 @@ class ConcatOp : public framework::OperatorWithKernel { for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { out_dims[axis] += ins[i][j]; - continue; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "dimensions except the specified axis."); } - PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], - "Input tensors should have the same " - "elements except the specify axis.") } } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { public: - ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); @@ -94,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, - ops::ConcatOpGrad) +REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, + ops::ConcatOpGrad, false) REGISTER_OP_CPU_KERNEL(concat, ops::ConcatKernel)
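// Aside: the shape rule ConcatOp::InferShape enforces above, as an
// illustrative helper (assumed name): every dimension must match except
// the concat axis, which accumulates, and an unknown (-1) extent stays
// unknown.
#include <cstdint>
#include <vector>
std::vector<int64_t> ConcatDimsSketch(
    const std::vector<std::vector<int64_t>>& ins, size_t axis) {
  std::vector<int64_t> out = ins[0];
  for (size_t i = 1; i < ins.size(); ++i)
    for (size_t j = 0; j < out.size(); ++j)
      if (j == axis) out[axis] += ins[i][j];  // others checked equal above
  if (out[axis] < 0) out[axis] = -1;  // any -1 input keeps the axis unknown
  return out;
}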
REGISTER_OP_CPU_KERNEL(concat_grad, diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc index ede832ddcd486729db56bba016683b33875f8837..7b46452d3d5db58799923a3dc76bb9df3471d9e7 100644 --- a/paddle/operators/concat_op.cu.cc +++ b/paddle/operators/concat_op.cu.cc @@ -14,7 +14,8 @@ limitations under the License. */ #include "paddle/operators/concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(concat, - ops::ConcatKernel); -REGISTER_OP_GPU_KERNEL( - concat_grad, ops::ConcatGradKernel); +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h index c113f19fb5cf806709bff845ee0f1078b34014bb..de4011585af81363368a096a5c361ff3f7aeecdb 100644 --- a/paddle/operators/concat_op.h +++ b/paddle/operators/concat_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class ConcatKernel : public framework::OpKernel { } }; -template +template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index b809bdc3a0fea727f2fb6ea0a55672ee9b0bbd04..e333002bfd1ab40c62882f09cd207a12a0939648 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/cond_op.h" - #include "paddle/operators/gather.h" #include "paddle/operators/scatter.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -193,20 +193,22 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, } } -void CondOp::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { +void CondOp::Run(const Scope& scope, const platform::Place& place) const { + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); + PrepareDataForSubnet(scope, dev_ctx); std::vector& sub_scopes = GetSubScopes(scope); for (int i = 0; i < BRANCH_NUM; ++i) { - sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx); + sub_net_op_[i]->Run(*sub_scopes[i], place); } MergeDataFromSubnet(scope, dev_ctx); } class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { public: - CondOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Cond", "The condition, which is a bool vector"); AddInput("Xs", "Inputs of Subnets").AsDuplicable(); diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h index 93121fb31be287794249b5a62386d5a8dd268a0c..7dcdc47e0b2ff216bea92d083fe5897009384d39 100644 --- a/paddle/operators/cond_op.h +++ b/paddle/operators/cond_op.h @@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase { } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; + const platform::Place& place) const override; private: const int TRUE_BRANCH = 0; diff --git a/paddle/operators/conditional_block_op.cc 
b/paddle/operators/conditional_block_op.cc index d5b124682d755ffb39f32c9f001a3cf113a01a2c..3cae61a438431e72cb24d714c761676cc0c3a41f 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include "paddle/framework/executor.h" #include "paddle/framework/op_registry.h" @@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp { const framework::AttributeMap &attrs) : ConditionalOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto xs = InputTensors(scope); bool need_run = std::all_of( xs.begin(), xs.end(), @@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp { scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); - auto *block = Attr("block"); - framework::Executor exec(dev_ctx); + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); exec.Run(*block->Program(), &cur_scope, block->ID(), false); } } @@ -74,8 +74,7 @@ class ConditionalBlockOp : public ConditionalOp { class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ConditionalBlockOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The conditional variable of this operator. If X is empty, the " @@ -87,8 +86,8 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(std::vector) The step scope of conditional block. To " "unify the conditional block, rnn and while op, the type of " "scope is std::vector"); - AddAttr( - "block", "The step block of conditional block operator"); + AddAttr( + "sub_block", "The step block of conditional block operator"); AddComment(R"DOC(Conditional block operator Run the sub-block if X is not empty. 
Params is the other inputs and Out is the @@ -105,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp { const framework::AttributeMap &attrs) : ConditionalOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto xs = this->InputTensors(scope); bool need_run = std::all_of( xs.begin(), xs.end(), @@ -117,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp { auto &scopes = scope_var->Get>(); framework::Scope &cur_scope = *scopes[0]; - auto *block = Attr("block"); - framework::Executor exec(dev_ctx); + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"), + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), Outputs(framework::GradVarName("Params"))); - AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"), + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), Outputs(framework::GradVarName("X"))); } } private: void AssignLocalGradientToGlobal( - const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope, + const platform::Place &place, const framework::Scope &cur_scope, const std::vector &p_names, const std::vector &pg_names) const { for (size_t i = 0; i < p_names.size(); ++i) { @@ -142,10 +141,10 @@ class ConditionalBlockGradOp : public ConditionalOp { continue; } auto new_in_grad_name = cur_scope.Rename(in_grad_name); - auto assign = - framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}}, - {{"Out", {out_grad_name}}}, {}); - assign->Run(cur_scope, dev_ctx); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); + assign->Run(cur_scope, place); cur_scope.Rename(new_in_grad_name, in_grad_name); } } @@ -171,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); grad_op->SetInput("X", Input("X")); grad_op->SetInput("Params", Input("Params")); grad_op->SetInput("Out", Output("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params")); - grad_op->SetBlockAttr("block", *this->grad_block_[0]); - return std::unique_ptr(grad_op); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + grad_op->SetOutput(framework::GradVarName("Params"), + InputGrad("Params", false)); + grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc deleted file mode 100644 index c03dc3e4fb07ac6ecde42be93a1138d91778edf4..0000000000000000000000000000000000000000 --- a/paddle/operators/conv_cudnn_op.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
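// Aside: the recurring refactor in these operator files replaces
// Run(scope, dev_ctx) with Run(scope, place), fetching the context from
// a process-wide pool. A minimal sketch of that pool idea (names with a
// Sketch suffix are assumptions, not the actual Paddle types):
#include <map>
#include <memory>
struct PlaceSketch {
  int device_id;
  bool operator<(const PlaceSketch& o) const {
    return device_id < o.device_id;
  }
};
struct DeviceContextSketch { /* streams and handles live here */ };
class DeviceContextPoolSketch {
 public:
  static DeviceContextPoolSketch& Instance() {
    static DeviceContextPoolSketch pool;
    return pool;
  }
  DeviceContextSketch* Get(const PlaceSketch& place) {
    auto& slot = ctxs_[place];
    if (!slot) slot.reset(new DeviceContextSketch);  // lazy, one per place
    return slot.get();
  }
 private:
  std::map<PlaceSketch, std::unique_ptr<DeviceContextSketch>> ctxs_;
};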
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/conv_op.h" - -namespace paddle { -namespace operators { - -class CudnnConvOpMaker : public Conv2DOpMaker { - public: - CudnnConvOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : Conv2DOpMaker(proto, op_checker) { - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardware. This size should be chosen carefully.") - .SetDefault(4096); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad, - ops::ConvOpGrad); - -REGISTER_OP_CPU_KERNEL(conv_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv_cudnn_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 5eaf6b33704eb371fff4b949c6cc32a7a5dbc812..3a5409a7e3f29a4c46839d6395760fc7fe8c086e 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" @@ -28,14 +28,15 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; -static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024; +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = + static_cast(1024) * 1024 * 1024; template -class CudnnConvOpKernel : public framework::OpKernel { +class CUDNNConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -44,7 +45,8 @@ class CudnnConvOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -56,6 +58,21 @@ class CudnnConvOpKernel : public framework::OpKernel { ScopedFilterDescriptor filter_desc; ScopedConvolutionDescriptor conv_desc; DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. 
+ PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims()), groups); @@ -63,19 +80,34 @@ class CudnnConvOpKernel : public framework::OpKernel { layout, framework::vectorize2int(output->dims()), groups); cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims()), groups); - cudnnConvolutionDescriptor_t cudnn_conv_desc = - conv_desc.descriptor(paddings, strides, dilations); int input_channels = input->dims()[1]; - int input_height = input->dims()[2]; - int input_width = input->dims()[3]; - int output_channels = output->dims()[1]; - int output_height = output->dims()[2]; - int output_width = output->dims()[3]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + int output_channels = filter->dims()[0]; + int output_height, output_width, output_depth; + if (output->dims().size() == 5) { + output_depth = output->dims()[2]; + output_height = output->dims()[3]; + output_width = output->dims()[4]; + } else { + output_depth = 1; + output_height = output->dims()[2]; + output_width = output->dims()[3]; + } - int group_offset_in = input_channels / groups * input_height * input_width; + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; int group_offset_out = - output_channels / groups * output_height * output_width; + output_channels / groups * output_height * output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- void* cudnn_workspace = nullptr; @@ -86,7 +118,8 @@ class CudnnConvOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -97,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel { handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // Allocate on GPU memory - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- T alpha = 1.0f, beta = 0.0f; @@ -114,11 +147,11 @@ class CudnnConvOpKernel : public framework::OpKernel { }; template -class CudnnConvGradOpKernel : public framework::OpKernel { +class CUDNNConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -133,17 +166,32 @@ class CudnnConvGradOpKernel : public framework::OpKernel { 
std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; ScopedTensorDescriptor output_grad_desc; - ScopedTensorDescriptor input_grad_desc; ScopedFilterDescriptor filter_desc; ScopedFilterDescriptor filter_grad_desc; ScopedConvolutionDescriptor conv_desc; DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it manually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims()), groups); @@ -152,22 +200,35 @@ layout, framework::vectorize2int(output_grad->dims()), groups); cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims()), groups); - cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr; - cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr; - - cudnnConvolutionDescriptor_t cudnn_conv_desc = - conv_desc.descriptor(paddings, strides, dilations); int input_channels = input->dims()[1]; - int input_height = input->dims()[2]; - int input_width = input->dims()[3]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + int output_grad_channels = filter->dims()[0]; - int output_grad_height = output_grad->dims()[2]; - int output_grad_width = output_grad->dims()[3]; + int output_grad_height, output_grad_width, output_grad_depth; + if (input->dims().size() == 5) { + output_grad_depth = output_grad->dims()[2]; + output_grad_height = output_grad->dims()[3]; + output_grad_width = output_grad->dims()[4]; + } else { + output_grad_depth = 1; + output_grad_height = output_grad->dims()[2]; + output_grad_width = output_grad->dims()[3]; + } - int group_offset_in = input_channels / groups * input_height * input_width; - int group_offset_out = - output_grad_channels / groups * output_grad_height * output_grad_width; + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = output_grad_channels / groups * output_grad_height * + output_grad_width * output_grad_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn backward algorithm --------------------- cudnnConvolutionBwdDataAlgo_t data_algo; @@ -178,10 +239,9 @@ workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - cudnn_input_grad_desc =
input_grad_desc.descriptor( - layout, framework::vectorize2int(input_grad->dims()), groups); PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -190,19 +250,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel { cudnn_output_grad_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. - cudnn_input_grad_desc, + cudnn_input_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size)); + cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } if (filter_grad) { - cudnn_filter_grad_desc = filter_grad_desc.descriptor( - layout, framework::vectorize2int(filter_grad->dims()), groups); PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, @@ -219,10 +277,9 @@ class CudnnConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- // Already on GPU void* cudnn_workspace = nullptr; - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- - // FIXME(typhoonzero): template type T may not be the same as cudnn call. T alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); @@ -233,21 +290,20 @@ class CudnnConvGradOpKernel : public framework::OpKernel { handle, &alpha, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_input_grad_desc, input_grad_data + i * group_offset_in)); + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + i * group_offset_in)); } } // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
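Both backward algorithms in the hunks above are selected under a shared byte budget, and workspace_size_in_bytes is grown with std::max so a single allocation serves backward-data and backward-filter. A sketch of the budgeting, assuming the 1 GiB default seen in kConvCudnnWorkspaceLimitBytes (a hypothetical helper, not the kernel code):

    #include <cstddef>
    #include <cstdint>

    // Byte limit handed to cuDNN's *_SPECIFY_WORKSPACE_LIMIT algorithm queries.
    size_t WorkspaceLimitBytes(int64_t user_mb) {
      const size_t kDefault = static_cast<size_t>(1024) * 1024 * 1024;  // 1 GiB
      return user_mb > 0 ? static_cast<size_t>(user_mb) * 1024 * 1024 : kDefault;
    }
    // One buffer then serves both passes:
    //   workspace_size_in_bytes = std::max(data_need, filter_need);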
- for (int i = 0; i < groups; i++) { PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_grad_desc, + workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); } } @@ -259,8 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); +REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 7a36a9b21aa6a1b415ac5a232e65eda8051c87f8..424eccdb7dc57195d20f75460032a99c76e6adcd 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/conv_op.h" @@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector paddings = ctx->Attrs().Get>("paddings"); int groups = ctx->Attrs().Get("groups"); std::vector dilations = ctx->Attrs().Get>("dilations"); - int input_channels = in_dims[1]; - int output_channels = filter_dims[0]; PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "Conv intput should be 4-D or 5-D tensor."); @@ -45,11 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( paddings.size(), strides.size(), "Conv paddings dimension and Conv strides dimension should be the same."); - PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, + + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, "The number of input channels should be equal to filter " "channels * groups."); + PADDLE_ENFORCE_EQ( - output_channels % groups, 0, + filter_dims[0] % groups, 0, "The number of output channels should be divided by groups."); std::vector output_shape({in_dims[0], filter_dims[0]}); @@ -64,10 +64,27 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { dilations[i], paddings[i], strides[i])); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Output"); } -Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) +framework::OpKernelType ConvOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "Input", @@ -97,7 +114,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, .SetDefault({0, 0}); AddAttr( "groups", - "(int default:1), the group size of convolution operator. " + "(int default:1), the groups number of the convolution operator. " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "when group=2, the first half of the filters is only connected to the " "first half of the input channels, while the second half of the filters " @@ -108,32 +125,57 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, "dilations(h_dilation, w_dilation) of " "convolution operator.") .SetDefault({1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. 
This size should be chosen carefully.") + .SetDefault(4096); AddComment(R"DOC( Convolution Operator. The convolution operation calculates the output based on the input, filter -and strides, paddings, groups, dilations parameters. The size of each dimension of the +and strides, paddings, dilations, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. -Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch +Input(Input) and Output(Output) are in NCHW format. Where N is batch size, C is the number of channels, H is the height of the feature, and W is -the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements. -These two elements represent height and width, respectively. +the width of the feature. +Filters(Input) is MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. The input(X) size and output(Out) size may be different. Example: Input: - Input shape: (N, C_in, H_in, W_in) - Filter shape: (C_out, C_in, H_f, W_f) + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ Output: - Output shape: (N, C_out, H_out, W_out) - where - H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1; - W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1; + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ )DOC"); } -Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) +Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "Input", @@ -165,7 +207,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, .SetDefault({0, 0, 0}); AddAttr( "groups", - "(int default:1), the group size of convolution operator. " + "(int default:1), the groups number of the convolution operator. " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "when group=2, the first half of the filters is only connected to the " "first half of the input channels, while the second half of the filters " @@ -174,32 +216,56 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, AddAttr>("dilations", "(vector default:{1, 1, 1}), the " "dilations(d_dilation, h_dilation, w_dilation) of " - "convolution operator. Currently, conv3d doesn't " - "support dilation.") + "convolution operator.") .SetDefault({1, 1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. 
workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); AddComment(R"DOC( Convolution3D Operator. The convolution operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and strides, paddings, dilations, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. -Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch +Input(Input) and output(Output) are in NCDHW format, where N is batch size, C is the number of channels,D is the depth of the feature, H is the height of -the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) -are three elements. These three elements represent depth, height and width, respectively. +the feature, and W is the width of the feature. +Filters(Input) is MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. The input(X) size and output(Out) size may be different. Example: Input: - Input shape: (N, C_in, D_in, H_in, W_in) - Filter shape: (C_out, C_in, D_f, H_f, W_f) + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ Output: - Output shape: (N, C_out, D_out, H_out, W_out) - where - D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1; - H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1; - W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1; + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ )DOC"); } @@ -214,26 +280,44 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { } } +framework::OpKernelType ConvOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, ops::ConvOpGrad); -namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + 
conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 546451234a1ed1a4d3119cb175c6d37ae3f0aac1..4f942444f3eb5584f07399b8d1b4d6a5087496d4 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -1,31 +1,33 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_GPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index fac5f1d0e25fe205f89fc7eeb9fadfd8431517d5..5a8933e7915960f9fcbe92ae73c4f37b3b69ecaf 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -38,7 +38,7 @@ inline bool IsExpand(std::vector& filter_dim, std::vector& dilations) { bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j]) == 1); + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); strides_1 = strides_1 && (strides[j] == 1); padding_0 = padding_0 && (paddings[j] == 0); dilation_1 = dilation_1 && (dilations[j] == 1); @@ -50,29 +50,35 @@ inline bool IsExpand(std::vector& filter_dim, // operator implementations can reuse the code. 
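Two details from the conv_op hunks above are worth pinning down. First, the per-axis output size that the new DOC blocks document; second, the IsExpand fix, which must index filter_dim at j + 2 now that filter_shape_vec keeps the leading {k_o, k_i} entries. A standalone sketch of both, mirroring the diff (hypothetical helpers with simplified types, not the framework code):

    #include <cstddef>
    #include <vector>

    // out = (in + 2*pad - (dilation*(k - 1) + 1)) / stride + 1, per spatial axis.
    int ConvOutputSize(int in, int k, int dilation, int pad, int stride) {
      return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
    }
    // e.g. in = 32, k = 3, dilation = 2, pad = 2, stride = 1 -> (32+4-5)/1+1 = 32.

    // True when im2col/vol2col must materialize the column buffer; j + 2 skips
    // the {k_o, k_i} entries of the full filter shape.
    bool IsExpand(const std::vector<long>& filter_dim,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& dilations) {
      bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
      for (size_t j = 0; j < strides.size(); ++j) {
        filter_1 = filter_1 && (filter_dim[j + 2] == 1);
        strides_1 = strides_1 && (strides[j] == 1);
        padding_0 = padding_0 && (paddings[j] == 0);
        dilation_1 = dilation_1 && (dilations[j] == 1);
      }
      return !(filter_1 && strides_1 && padding_0 && dilation_1);
    }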
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv2DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv3DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class ConvOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class ConvOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; -template +template class GemmConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -91,32 +97,28 @@ class GemmConvKernel : public framework::OpKernel { const int batch_size = static_cast(input->dims()[0]); - // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w} + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} std::vector filter_shape_vec(framework::vectorize(filter.dims())); - filter_shape_vec.erase(filter_shape_vec.begin(), - filter_shape_vec.begin() + 2); - - // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w} + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} std::vector output_shape_vec(framework::vectorize(output->dims())); - output_shape_vec.erase(output_shape_vec.begin(), - output_shape_vec.begin() + 2); // use col_shape in the im2col calculation // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, // o_h, o_w} - std::vector col_shape_vec; - col_shape_vec.push_back(input->dims()[1] / groups); - col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(), - filter_shape_vec.end()); - col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(), - output_shape_vec.end()); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } framework::DDim col_shape(framework::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * // o_h * o_w) framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1); + framework::flatten_to_2d(col_shape, data_dim + 1); bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); Tensor col; @@ -145,9 +147,10 @@ class GemmConvKernel : public framework::OpKernel { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + auto& dev_ctx = context.template device_context(); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, 
i + 1).Resize(output_matrix_shape); @@ -159,29 +162,28 @@ class GemmConvKernel : public framework::OpKernel { col.ShareDataWith(in_slice); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); - } else if (filter_shape_vec.size() == 2) { + } else if (data_dim == 2U) { // im2col - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); - } else if (filter_shape_vec.size() == 3) { + } else if (data_dim == 3U) { // vol2col - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), filter_slice, false, - col_matrix, false, T(1.0), &out_slice, T(0.0)); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); } } } }; -template +template class GemmConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -206,26 +208,22 @@ class GemmConvGradKernel : public framework::OpKernel { const int batch_size = static_cast(input->dims()[0]); - // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w} + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} std::vector filter_shape_vec(framework::vectorize(filter.dims())); - filter_shape_vec.erase(filter_shape_vec.begin(), - filter_shape_vec.begin() + 2); - - // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w} + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} std::vector output_shape_vec( framework::vectorize(output_grad->dims())); - output_shape_vec.erase(output_shape_vec.begin(), - output_shape_vec.begin() + 2); // use col_shape in the im2col calculation // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, // o_h, o_w} - std::vector col_shape_vec; - col_shape_vec.push_back(input->dims()[1] / groups); - col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(), - filter_shape_vec.end()); - col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(), - output_shape_vec.end()); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } framework::DDim col_shape(framework::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation @@ -233,7 +231,7 @@ class GemmConvGradKernel : public framework::OpKernel { // or // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1); + framework::flatten_to_2d(col_shape, data_dim + 1); framework::DDim input_shape = framework::slice_ddim( input->dims(), 1, static_cast(input->dims().size())); @@ -264,14 +262,19 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); - math::Col2VolFunctor col2vol; - 
math::Col2ImFunctor col2im; + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. + if (is_expand) { + set_zero(dev_ctx, input_grad, static_cast(0)); + } + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = @@ -290,18 +293,17 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(in_grad_slice); col_matrix.Resize(col_matrix_shape); } - math::matmul(context.device_context(), filter_slice, true, - out_grad_slice, false, T(1.0), &col_matrix, - T(0.0)); + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); - if (is_expand && filter_shape_vec.size() == 2) { - col2im(context.device_context(), col, dilations, strides, + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, col, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &in_grad_slice); - } else if (is_expand && filter_shape_vec.size() == 3) { - col2vol(context.device_context(), col, dilations, strides, paddings, - &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); } } } @@ -311,9 +313,9 @@ class GemmConvGradKernel : public framework::OpKernel { filter_grad->mutable_data(context.GetPlace()); Tensor filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); - set_zero(context.device_context(), filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_matrix_shape); @@ -328,22 +330,21 @@ class GemmConvGradKernel : public framework::OpKernel { col.ShareDataWith(in_slice); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); - } else if (filter_shape_vec.size() == 2) { - im2col(context.device_context(), in_slice, dilations, strides, + } else if (data_dim == 2U) { + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); - } else if (filter_shape_vec.size() == 3) { - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), out_grad_slice, - false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } diff --git a/paddle/operators/conv_shift_op.cc b/paddle/operators/conv_shift_op.cc index a4150a5664690e750d2501a1849767c23209186b..106b68a0a0e787a0c9da2de924f4646c77b42b41 100644 --- a/paddle/operators/conv_shift_op.cc +++ b/paddle/operators/conv_shift_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
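For orientation, the forward and backward paths in conv_op.h run three GEMMs over one shared column buffer per group. With K = i_c/g * k_h * k_w, S = o_h * o_w and M = o_c/g, the shapes line up as sketched below (illustrative sizes only, not framework code):

    #include <cassert>

    int main() {
      const int ic_g = 4, kh = 3, kw = 3, oh = 8, ow = 8, oc_g = 16;
      const int K = ic_g * kh * kw;  // col rows:    i_c/g * k_h * k_w
      const int S = oh * ow;         // col columns: o_h * o_w
      const int M = oc_g;            // filter rows per group: o_c/g
      // forward : out(M,S)       = filter(M,K)   * col(K,S)
      // d_input : col(K,S)       = filter^T(K,M) * d_out(M,S), then col2im/col2vol
      // d_filter: d_filter(M,K) += d_out(M,S)    * col^T(S,K)   (beta = 1)
      assert(K == 36 && S == 64 && M == 16);
      return 0;
    }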
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/conv_shift_op.h" #include "paddle/framework/eigen.h" @@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel { class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { public: - ConvShiftOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor), a 2-D tensor with shape B x M, " diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu index 95e13c38a8dd234f49393d2d4808607a447b0d4c..cf7abc196e1293ab1b998d1a8cb9c361a7c2d427 100644 --- a/paddle/operators/conv_shift_op.cu +++ b/paddle/operators/conv_shift_op.cu @@ -1,16 +1,16 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/conv_shift_op.h" #include "paddle/operators/math/math_function.h" @@ -111,7 +111,8 @@ __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, } // namespace template -class ConvShiftKernel : public framework::OpKernel { +class ConvShiftKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -132,7 +133,8 @@ class ConvShiftKernel : public framework::OpKernel { dim3 grid_dim(num_x_blocks, batch_size); - auto stream = context.cuda_device_context().stream(); + auto stream = + context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -140,7 +142,7 @@ class ConvShiftKernel : public framework::OpKernel { }; template -class ConvShiftGradKernel +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -159,8 +161,9 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -186,8 +189,9 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( conv_shift_grad, - ops::ConvShiftGradKernel); + ops::ConvShiftGradKernel); diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h index 5a160b0f1696c70868fc48d219b38cde2018e8a3..6781d87ef0d99a0b0fc4747245920b6a38a33804 100644 --- a/paddle/operators/conv_shift_op.h +++ b/paddle/operators/conv_shift_op.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/framework/op_registry.h" @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc deleted file mode 100644 index 0192178ce3a0a47196232f0723baec8324bea60b..0000000000000000000000000000000000000000 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/conv_transpose_op.h" - -namespace paddle { -namespace operators { - -class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { - public: - CudnnConv2DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : Conv2DTransposeOpMaker(proto, op_checker) { - AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault({1, 1}); - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") - .SetDefault(4096); - } -}; - -class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker { - public: - CudnnConv3DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : Conv3DTransposeOpMaker(proto, op_checker) { - AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault({1, 1, 1}); - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardward. 
This size should be carefully setted.") - .SetDefault(4096); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp, - ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad, - ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, - ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, - ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index 494904fe524ae30a5032e489a0c5f20179d8e8ce..23bc97e13c13e5c1d62a406be5e528ab272fa93a 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" @@ -28,14 +28,14 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; -static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024; +static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; template -class CudnnConvTransposeOpKernel : public framework::OpKernel { +class CUDNNConvTransposeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -77,13 +77,14 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. - size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { workspace_size_limit = user_workspace_size * 1024 * 1024; } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionBwdDataAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); // Get the algorithm PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, @@ -99,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { cudnn_output_desc, algo, &workspace_size_in_bytes)); // Allocate on GPU memory - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv transpose forward --------------------- @@ -115,11 +116,11 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { }; template -class CudnnConvTransposeGradOpKernel : public framework::OpKernel { +class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -160,12 +161,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { cudnnConvolutionBwdFilterAlgo_t filter_algo; size_t bwd_filter_ws_size, fwd_ws_size; size_t workspace_size_in_bytes = 0; - size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( @@ -199,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace 
--------------------- // Already on GPU void* cudnn_workspace = nullptr; - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. @@ -234,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); + +REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 3e55ef036a7fb976117054574d1347fa943acd55..cf4e8c0a303d6888c3b1f2475a483c4bfc90981b 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/conv_transpose_op.h" @@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { auto filter_dims = ctx->GetInputDim("Filter"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "ConvTransposeOp intput should be 4-D or 5-D tensor."); @@ -39,7 +40,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { "ConvTransposeOp input dimension and strides dimension should " "be consistent."); PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), - "ConvTransposeOp paddings dimension and Conv strides " + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " "dimension should be the same."); PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], "In ConvTransposeOp, The input channel should be the same " @@ -47,14 +51,32 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector output_shape({in_dims[0], filter_dims[1]}); for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + - filter_dims[i + 2]); + filter_extent); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } -Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( - framework::OpProto* proto, framework::OpAttrChecker* op_checker) +framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "Input", @@ -62,52 +84,83 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( "The format of input tensor is NCHW. Where N is batch size, C is the " "number of input channels, H is the height of the feature, and " "W is the width of the feature."); - AddInput("Filter", - "(Tensor) The filter tensor of convolution transpose operator. " - "The format of the filter tensor is CMHW, where C is the number of " - "output image channels, M is the number of input image channels, " - "H is the height of the filter, and W is the width of the filter. " - "We enforce groups number == 1 and padding == 0 in " - "the convolution transpose scenario."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels," + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. 
" "The format of output tensor is also NCHW."); + + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); AddAttr>( "strides", - "(vector defalut:{1, 1}), the strides(h_stride, w_stride) of " + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " "convolution transpose operator.") .SetDefault({1, 1}); AddAttr>( "paddings", - "(vector defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " "transpose operator.") .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); AddComment(R"DOC( Convolution2D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and dilations, strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. - -Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch -size, C is the number of channels, H is the height of the feature, and -W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. -These two elements represent height and width, respectively. +Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. +Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. The input(X) size and output(Out) size may be different. + Example: Input: - Input shape: (N, C_in, H_in, W_in) - Filter shape: (C_in, C_out, H_f, W_f) + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ Output: - Output shape: (N, C_out, H_out, W_out) - where - H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f + $$ )DOC"); } -Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( - framework::OpProto* proto, framework::OpAttrChecker* op_checker) +Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(Tensor) The input tensor of convolution transpose operator." 
@@ -117,8 +170,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "W is the width of the feature."); AddInput("Filter", "(Tensor) The filter tensor of convolution transpose operator." - "The format of the filter tensor is CMDHW, where C is the number of " - "output image channels, M is the number of input image channels, D " + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " "is the depth of the filter, H is the height of the filter, and " "W is the width of the filter." "We enforce groups number == 1 and padding == 0 in " @@ -129,38 +183,69 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "Where N is batch size, C is " "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); + + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation, h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); AddAttr>("strides", - "(vector defalut:{1, 1, 1}), the " + "(vector default:{1, 1, 1}), the " "strides{d_stride, h_stride, w_stride} of " "convolution transpose operator.") .SetDefault({1, 1, 1}); AddAttr>("paddings", - "(vector defalut:{0, 0, 0}), paddings(d_pad, " + "(vector default:{0, 0, 0}), paddings(d_pad, " "h_pad, w_pad) of convolution transpose operator.") .SetDefault({0, 0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel; requires cudnn to be installed") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default AnyLayout) An optional string from: " + "\"NHWC\", \"NCHW\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to register the layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be set carefully.") + .SetDefault(4096); AddComment(R"DOC( Convolution3D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and dilations, strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. - -Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch -size, C is the number of channels, D is the depth of the feature, -H is the height of the feature, and W is the width of the feature. -Parameters(ksize, strides, paddings) are three elements. -These three elements represent depth, height and width, respectively. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format. Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter, H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements represent +depth, height and width, respectively. The input(X) size and output(Out) size may be different. 
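As a quick check of the dilation-aware formulas in the Example block just below: with $D_{in} = 8$, $D_f = 3$, dilations[0] = 2, paddings[0] = 1 and strides[0] = 2, $D_{out} = (8 - 1) * 2 - 2 * 1 + (2 * (3 - 1) + 1) = 14 - 2 + 5 = 17$.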
-Example: + +Example: Input: - Input shape: (N, C_in, D_in, H_in, W_in) - Filter shape: (C_in, C_out, D_f, H_f, W_f) + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ Output: - Output shape: (N, C_out, D_out, H_out, W_out) - where - D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; - W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2]; + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f + $$ )DOC"); } @@ -175,6 +260,23 @@ void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { } } +framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + } // namespace operators } // namespace paddle @@ -185,21 +287,23 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, REGISTER_OP_CPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc index 4165eb0c7b048b83bbd94c57b971530043b66545..f1d827c606283440debb9a0edb25168816a3a08c 100644 --- a/paddle/operators/conv_transpose_op.cu.cc +++ b/paddle/operators/conv_transpose_op.cu.cc @@ -1,35 +1,39 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/conv_transpose_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index ab336ad23ce1c180b68d04e4c85b299e301d5376..a42ade41b165d1bfa00d2db0e45d40cf5d7b00bc 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -30,29 +30,35 @@ using DDim = framework::DDim; // operator implementations can reuse the code. 
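The header that follows carries the same refactor as the registrations above: kernels are templated on a device-context type rather than a Place, so one body serves CPU and CUDA. A simplified, self-contained sketch of that pattern (the types here are stand-ins, not Paddle's):

    #include <iostream>

    struct CPUDeviceContext {};
    struct CUDADeviceContext {};

    // One kernel body, instantiated once per device-context type; the real
    // kernels fetch the context via context.template device_context<DeviceContext>().
    template <typename DeviceContext, typename T>
    struct KernelSketch {
      void Compute() const { std::cout << "one shared kernel body\n"; }
    };

    int main() {
      KernelSketch<CPUDeviceContext, float>().Compute();
      KernelSketch<CUDADeviceContext, double>().Compute();
      return 0;
    }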
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv2DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv3DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class ConvTransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class ConvTransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; -template +template class GemmConvTransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -63,35 +69,31 @@ class GemmConvTransposeKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - // TODO(Zhuoyuan): Paddings can be added in future. + std::vector dilations = context.Attr>("dilations"); // groups will always be disabled in conv2d_transpose. const int batch_size = static_cast(input->dims()[0]); - // input_shape_vec: {h, w} or {d, h, w} + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} std::vector input_shape_vec = framework::vectorize(input->dims()); - input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2); - - // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w} + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); - filter_shape_vec.erase(filter_shape_vec.begin(), - filter_shape_vec.begin() + 2); // use col_shape in the im2col and col2im (or vol2col and col2vol) // calculation // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} - std::vector col_shape_vec; - col_shape_vec.push_back(output->dims()[1]); - col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(), - filter_shape_vec.end()); - col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(), - input_shape_vec.end()); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } DDim col_shape(framework::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1); + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); Tensor col; col.mutable_data(col_shape, context.GetPlace()); @@ -114,12 +116,12 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; -
set_zero(context.device_context(), output, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; - std::vector dilations({1, 1, 1}); + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; // convolution transpose: gemm + col2im or col2vol (similar to conv-backward // on input) @@ -132,29 +134,27 @@ class GemmConvTransposeKernel : public framework::OpKernel { // col_matrix = filter * input_batch // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - math::matmul(context.device_context(), filter, true, - input_batch, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); - if (filter_shape_vec.size() == 2) { + if (data_dim == 2U) { // col2im: col_matrix -> dy // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(context.device_context(), col, - std::vector{dilations[0], dilations[1]}, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, + col2im(dev_ctx, col, std::vector{dilations[0], dilations[1]}, + strides, std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &output_batch); - } else if (filter_shape_vec.size() == 3) { + } else if (data_dim == 3U) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, paddings, - &output_batch); + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); } } } }; -template +template class GemmConvTransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -173,33 +173,30 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); const int batch_size = static_cast(input->dims()[0]); - // input_shape_vec: {h, w} or {d, h, w} + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} std::vector input_shape_vec = framework::vectorize(input->dims()); - input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2); - - // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w} + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} std::vector filter_shape_vec = framework::vectorize(filter.dims()); - filter_shape_vec.erase(filter_shape_vec.begin(), - filter_shape_vec.begin() + 2); // use col_shape in the im2col and col2im (or vol2col and col2vol) // calculation // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} - std::vector col_shape_vec; - col_shape_vec.push_back(output_grad->dims()[1]); - col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(), - filter_shape_vec.end()); - col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(), - input_shape_vec.end()); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output_grad->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } DDim col_shape(framework::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - DDim 
col_matrix_shape = - framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1); + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) DDim output_shape = framework::slice_ddim(output_grad->dims(), 1, @@ -215,6 +212,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // convolution transpose grad on input: // im2col + gemm (similar to conv-forward) // input need to compute gradient + auto& dev_ctx = context.template device_context(); if (input_grad || filter_grad) { Tensor col; col.mutable_data(col_shape, context.GetPlace()); @@ -226,19 +224,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + math::SetConstant set_zero; - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - std::vector dilations({1, 1, 1}); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); } if (filter_grad) { // filter size (m, c, k_h, k_w) filter_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_grad, static_cast(0)); + set_zero(dev_ctx, filter_grad, static_cast(0)); filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); } @@ -248,19 +244,19 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { Tensor output_grad_batch = output_grad->Slice(i, i + 1).Resize(output_shape); - if (filter_shape_vec.size() == 2) { + if (data_dim == 2U) { // im2col: dy -> col matrix // from (c, o_h, o_w) to (c * k_h * k_w, h * w) - im2col(context.device_context(), output_grad_batch, + im2col(dev_ctx, output_grad_batch, std::vector{dilations[0], dilations[1]}, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); - } else if (filter_shape_vec.size() == 3) { + } else if (data_dim == 3U) { // vol2col: dy -> col_matrix // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) - vol2col(context.device_context(), output_grad_batch, dilations, - strides, paddings, &col); + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); } if (input_grad) { @@ -272,9 +268,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - math::matmul(context.device_context(), filter, false, - col_matrix, false, static_cast(1.0), - &input_grad_batch, static_cast(0.0)); + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); } if (filter_grad) { // input batch @@ -284,9 +280,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - math::matmul(context.device_context(), in_batch, false, - col_matrix, true, static_cast(1.0), - &filter_grad_, static_cast(1.0)); + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); } } } diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 312264ccd48d1405a247a2c864d9f5897c897bea..9019a1edb379be4007e38d3c0dc71feae23ae4e8 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/cos_sim_op.h" @@ -62,7 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel { class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { public: - CosSimOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The 1st input of cos_sim op."); AddInput("Y", "The 2nd input of cos_sim op."); @@ -155,7 +155,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL(cos_sim, - ops::CosSimKernel); REGISTER_OP_CPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 0cb8fd26de47a4a464db98664263544e3e503d63..9e5d1b6e4f0b6e482edd96df93d535e05dba3bc6 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -1,22 +1,23 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
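The rewritten kernels below compute row-wise cosine similarity through functors rather than Eigen broadcasts. For reference, the quantity itself is cos(x, y) = x·y / (|x||y|); a standalone scalar-loop sketch (hypothetical helper, editorial only):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Cosine similarity of two equal-length rows: dot(x, y) / (|x| * |y|).
    double CosSim(const std::vector<double>& x, const std::vector<double>& y) {
      double xy = 0, xx = 0, yy = 0;
      for (size_t i = 0; i < x.size(); ++i) {
        xy += x[i] * y[i];
        xx += x[i] * x[i];
        yy += y[i] * y[i];
      }
      return xy / (std::sqrt(xx) * std::sqrt(yy));
    }

    int main() {
      // Parallel vectors have similarity exactly 1.
      assert(std::abs(CosSim({1, 2}, {2, 4}) - 1.0) < 1e-12);
      return 0;
    }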
*/ #define EIGEN_USE_GPU #include "paddle/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cos_sim, - ops::CosSimKernel); -REGISTER_OP_GPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 62a4e484eceeabc4cc26e68ac54a50be1ac95df7..eadcca55f9bfc3e59f329df8ff419ad4c5a29007 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -1,33 +1,29 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/cos_sim_functor.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/for_range.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; -template -using EigenVector = framework::EigenVector; -template +template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -41,32 +37,30 @@ class CosSimKernel : public framework::OpKernel { out_x_norm->mutable_data(context.GetPlace()); out_y_norm->mutable_data(context.GetPlace()); - // convert Tensor to Eigen Tensor int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; - auto x = EigenMatrix::Reshape(*in_x, 1); - auto y = EigenMatrix::Reshape(*in_y, 1); - auto z = EigenVector::Flatten(*out_z); - auto x_norm = EigenVector::Flatten(*out_x_norm); - auto y_norm = EigenVector::Flatten(*out_y_norm); - // compute - auto place = context.GetEigenDevice(); - auto row_along = Eigen::array({{1}}); - x_norm.device(place) = x.square().sum(row_along).sqrt(); - y_norm.device(place) = y.square().sum(row_along).sqrt(); + int cols = framework::product(in_x->dims()) / rows_x; + if (rows_x == rows_y) { - auto xy = (x * y).sum(Eigen::array({{1}})); - z.device(place) = xy / x_norm / y_norm; + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); } else { - Eigen::DSizes bcast(rows_x, 1); - auto xy = (x * y.broadcast(bcast)).sum(row_along); - z.device(place) = xy / x_norm / 
y_norm.broadcast(bcast); + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); } } }; -template +template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -80,61 +74,54 @@ class CosSimGradKernel : public framework::OpKernel { auto* out_grad_y = context.Output(framework::GradVarName("Y")); auto* in_grad_z = context.Input(framework::GradVarName("Out")); - // convert Tensor to Eigen Tensor - auto x = EigenMatrix::Reshape(*in_x, 1); - auto y = EigenMatrix::Reshape(*in_y, 1); - auto z = EigenMatrix::Reshape(*in_z, 1); - auto x_norm = EigenMatrix::Reshape(*in_x_norm, 1); - auto y_norm = EigenMatrix::Reshape(*in_y_norm, 1); - auto dz = EigenMatrix::Reshape(*in_grad_z, 1); - // compute gradident int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - Eigen::DSizes bcast_cols(1, cols); - auto z_bcast = z.broadcast(bcast_cols); - auto dz_bcast = dz.broadcast(bcast_cols); - auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto place = context.GetEigenDevice(); + if (rows_x == rows_y) { - auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); - auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); - // compute dx if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::Reshape(*out_grad_x, 1); - auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast; - dx.device(place) = dz_bcast * grad; + math::CosSimGradFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); } - // compute dy if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::Reshape(*out_grad_y, 1); - auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast; - dy.device(place) = dz_bcast * grad; + math::CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); } } else { - Eigen::DSizes bcast_rows(rows_x, 1); - Eigen::DSizes bcast_rows_cols(rows_x, cols); - auto y_bcast = y.broadcast(bcast_rows); - auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols); - auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows)) - .eval() - .broadcast(bcast_cols); - // compute dx if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::Reshape(*out_grad_x, 1); - auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast; - dx.device(place) = dz_bcast * grad; + math::CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); } - // compute dy if (out_grad_y) { out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenVector::Flatten(*out_grad_y); - auto grad = x / 
norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; - dy.device(place) = (dz_bcast * grad).sum(Eigen::array({{0}})); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out_grad_y, static_cast(0)); + + math::CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); } } } diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index f418f489c0ff471464a23380598e9f4c8da16ca9..30626028c137a4ac3acf67f37ba5c5bb75215b3a 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -18,8 +18,7 @@ namespace paddle { namespace operators { class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { public: - CRFDecodingOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Emission", "(LoDTensor, default: LoDTensor). A LoDTensor with shape " @@ -36,17 +35,18 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { "w. See more details in comments of the linear_chain_crf operator."); AddInput( "Label", - "(LoDTensor, LoDTensor). The ground truth with shape " + "(LoDTensor, LoDTensor). The ground truth with shape " "[N x 1]. This input is optional. See more details in the operator's " "comments.") .AsDispensable(); - AddOutput("ViterbiPath", - "(LoDTensor, LoDTensor). The decoding results. What to " - "return changes depending on whether the Input(Label) (the groud " - "truth) is given. See more details in the operator's comment."); + AddOutput( + "ViterbiPath", + "(LoDTensor, LoDTensor). The decoding results. What to " + "return changes depending on whether the Input(Label) (the ground " + "truth) is given. See more details in the operator's comment."); AddComment(R"DOC( The crf_decoding operator reads the emission feature weights and the transition -freature weights learned by the linear_chain_crf operator. It implements the +feature weights learned by the linear_chain_crf operator. It implements the Viterbi algorithm which is a dynamic programming algorithm for finding the most likely sequence of hidden states, called the Viterbi path, that results in a sequence of observed tags. @@ -60,14 +60,14 @@ operator. When Input(Label) is given, the crf_decoding operator returns a row vector with shape [N x 1] whose values are fixed to be 0, indicating an incorrect -prediction, or 1 indicating a tag is correctly predicted. Such an ouput is the +prediction, or 1 indicating a tag is correctly predicted. Such an output is the input to chunk_eval operator. 2. Input(Label) is not given: This is the standard decoding process. -The crf_decoding operator returns a row vecotr with shape [N x 1] whose values +The crf_decoding operator returns a row vector with shape [N x 1] whose values range from 0 to maximum tag number - 1. Each element indicates an index of a predicted tag. 
)DOC"); @@ -120,11 +120,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; } // namespace operators @@ -134,5 +134,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); REGISTER_OP_CPU_KERNEL( - crf_decoding, ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index 526e0c5dcb2649b35ee28f5153c8472ca7a0af7b..ce2f4e6622c21e1e5383b4a3aefc2987bf155aec 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -24,13 +24,10 @@ using framework::LoDTensor; using framework::LoD; using framework::Tensor; -template +template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "The crf_decoding operator can only run on CPU."); - auto* emission_weights = ctx.Input("Emission"); auto* transition_weights = ctx.Input("Transition"); auto* label = ctx.Input("Label"); @@ -43,9 +40,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { const size_t level = 0; const size_t seq_num = lod[level].size() - 1; - int* path = decoded_path->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - decoded_path, 0); + int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); @@ -57,7 +54,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { if (label) { PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, "The Input(Label) should be a sequence."); - const int* label_value = label->data(); + const int64_t* label_value = label->data(); size_t batch_size = emission_weights->dims()[0]; for (size_t i = 0; i < batch_size; ++i) { path[i] = label_value[i] == path[i] ? 1 : 0; @@ -76,7 +73,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { const T* x = emission_weights.data(); const T* w = transition_weights.data(); - int* path = decoded_path->data(); + int64_t* path = decoded_path->data(); // alpha is a memo table. An element alpha(k, v) records the score of the // best sequence of tags from position 1 to position k with v being the end diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 6752eb8c1c72150b0b1cf5595211ca1d01ef2bf4..310e351443112c340054cf092cd2443b309ec49c 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/crop_op.h" #include @@ -52,7 +52,7 @@ class CropOp : public framework::OperatorWithKernel { class CropOpMaker : public framework::OpProtoAndCheckerMaker { public: - CropOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + CropOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of crop op. " "The input should be a k-D tensor(k > 0 and k < 7). As an example: -Given: +Case 1: +Given X = [[0, 1, 2, 0, 0] [0, 3, 4, 0, 0] [0, 0, 0, 0, 0]], and offsets = [0, 1], and Y = [[0, 0, 0] [0, 0, 0]], we get: Out = [[1, 2], [3, 4]]. + +Case 2: +Given + + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + Y = [[0, 0, 0] + [0, 0, 0]], + +we get: + + Out = [[1, 2, 5], + [3, 4, 6]]. )DOC"); } }; @@ -133,5 +155,5 @@ class CropOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f8ee18a1d6e894cbb2d71dd4b6b459abeb076817..bba5db4c6ce682cb00482e35fa1e340aba83e37f 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -1,21 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
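A tiny standalone check of the cropping rule shown in Case 2 above, where offsets pick the top-left corner and the reference input Y fixes the extent (hypothetical helper, editorial only):

    #include <cassert>
    #include <vector>

    using Mat = std::vector<std::vector<int>>;

    // Out[i][j] = X[i + off_h][j + off_w] over the reference shape (h, w).
    Mat Crop2D(const Mat& x, int off_h, int off_w, int h, int w) {
      Mat out(h, std::vector<int>(w));
      for (int i = 0; i < h; ++i)
        for (int j = 0; j < w; ++j) out[i][j] = x[i + off_h][j + off_w];
      return out;
    }

    int main() {
      Mat x = {{0, 1, 2, 5, 0}, {0, 3, 4, 6, 0}, {0, 0, 0, 0, 0}};
      // offsets = [0, 1] and a 2x3 reference shape reproduce Case 2's Out.
      assert(Crop2D(x, 0, 1, 2, 3) == (Mat{{1, 2, 5}, {3, 4, 6}}));
      return 0;
    }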
*/ #define EIGEN_USE_GPU #include "paddle/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_GPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 2e72583d68d0acf0e2f5044637dba55de3b57209..69d1a92977250b4e8a64b47ac66444724fbc53f6 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 CropdleCropdle Authors. All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -49,7 +49,7 @@ class CropKernel : public framework::OpKernel { } }; -template +template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { @@ -63,12 +63,13 @@ void CropGradFunction(const framework::ExecutionContext& context) { } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device(context.GetEigenDevice()) = + d_x_tensor.device( + *context.template device_context().eigen_device()) = d_out_tensor.pad(paddings, 0); } } -template +template class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -76,22 +77,22 @@ class CropGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - CropGradFunction(context); + CropGradFunction(context); break; case 2: - CropGradFunction(context); + CropGradFunction(context); break; case 3: - CropGradFunction(context); + CropGradFunction(context); break; case 4: - CropGradFunction(context); + CropGradFunction(context); break; case 5: - CropGradFunction(context); + CropGradFunction(context); break; case 6: - CropGradFunction(context); + CropGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 1e82742eaf86711fe4f9d02d517ad1853131cf67..7abd5b1c61d610f4f723b13ad6ce61791b96d06d 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of 
cross_entropy // is determined by its input "X". - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -95,12 +95,13 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); } protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -110,19 +111,18 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: - CrossEntropyOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape N x D, " - "where N is the batch size and D is the number of classes. " + "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," + " where N is the batch size and D is the number of classes. " "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); AddInput("Label", "(Tensor), the ground truth which is a 2-D tensor. When " "soft_label is set to false, Label is a Tensor with shape " "[N x 1]. When soft_label is set to true, Label is a " - "Tensor with shape [N x K]."); + "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " "[N x 1]. The cross entropy loss."); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 6212e39dfde33c5943958adbd1a0a052262e119e..3b04894e6ccb08c13e2d24bb38196fdc7935bf9e 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
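For orientation before the kernels below: with hard labels, the per-row loss the operator documents is y = -log(x[label]). A minimal standalone sketch (editorial, hypothetical names):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Hard-label cross entropy for one row of class probabilities.
    double CrossEntropy(const std::vector<double>& x, int label) {
      return -std::log(x[label]);
    }

    int main() {
      // A confident correct prediction costs less than a confident wrong one.
      assert(CrossEntropy({0.1, 0.8, 0.1}, 1) < CrossEntropy({0.8, 0.1, 0.1}, 1));
      return 0;
    }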
*/ #include "paddle/operators/cross_entropy_op.h" @@ -53,8 +53,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, label, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); } }; @@ -80,15 +81,17 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (batch_size * class_num + block - 1) / block; - auto stream = ctx.cuda_device_context().stream(); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); if (ctx.Attr("soft_label")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, batch_size, class_num); } else { - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(dev_ctx, dx, 0); auto* label_data = label->data(); grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<>>( @@ -101,8 +104,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 37db0a930a6aea0ba333395ca9c5b9d231c07b32..5623d2ded16daaf51dd26c9d9a8c04a0ae5be5ec 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -37,8 +37,9 @@ class CrossEntropyOpKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, labels, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); } }; @@ -61,7 +62,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*label); auto dx_mat = EigenMatrix::From(*dx); - dx_mat.device(ctx.GetEigenDevice()) = + dx_mat.device(*ctx.template device_context() + .eigen_device()) = -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); } else { @@ -70,8 +72,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { const T* x_data = x->data(); const int64_t* label_data = label->data(); - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc index 640b4e77448d1b64bcf7375f26c07ff1d2bdeaa3..739a8d881c35817756421a3299901c9e5e7d96ba 100644 --- a/paddle/operators/decayed_adagrad_op.cc +++ b/paddle/operators/decayed_adagrad_op.cc @@ -55,8 +55,7 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - 
DecayedAdagradOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -99,4 +98,4 @@ REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); REGISTER_OP_CPU_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu index 6fce77fe4ec6b76cb7b0259aab6a3d55d2edb36c..7bc8161f2339572c2a9284f865846b9b7e594354 100644 --- a/paddle/operators/decayed_adagrad_op.cu +++ b/paddle/operators/decayed_adagrad_op.cu @@ -1,21 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h index 0fe0fc5acd66c9824a864618b69097c5c063ea3f..fec9705cfc1e14e5423e23d6afb218c6c051f5a1 100644 --- a/paddle/operators/decayed_adagrad_op.h +++ b/paddle/operators/decayed_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class DecayedAdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..571a75c9dcd903672d460f192bf28ddbeaea7c78 --- /dev/null +++ b/paddle/operators/detail/CMakeLists.txt @@ -0,0 +1 @@ +grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..aee56ffe018aa8d0d2106df24bd9358c930a02ca --- /dev/null +++ b/paddle/operators/detail/grpc_client.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
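Stepping back to the DecayedAdagrad kernel above: the update it applies is moment_out = decay * moment + (1 - decay) * grad^2, followed by param_out = param - lr * grad / (sqrt(moment_out) + epsilon). A scalar sketch under those assumptions (hypothetical names, editorial only):

    #include <cassert>
    #include <cmath>

    struct State { double param, moment; };

    // One DecayedAdagrad step on a single scalar parameter.
    State Step(State s, double grad, double lr, double decay, double eps) {
      s.moment = decay * s.moment + (1 - decay) * grad * grad;
      s.param -= lr * grad / (std::sqrt(s.moment) + eps);
      return s;
    }

    int main() {
      State s{1.0, 0.0};
      s = Step(s, /*grad=*/0.5, /*lr=*/0.1, /*decay=*/0.9, /*eps=*/1e-6);
      assert(s.param < 1.0);  // the parameter moves against the gradient
      return 0;
    }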
*/ + +#include "grpc_client.h" +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + sendrecv::VariableMessage req; + auto* var = scope.FindVar(var_name); + SerializeToMessage(var_name, var, ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep; + var_h.scope = &scope; + var_h.name = var_name; + var_h.ctx = &ctx; + + // stub context + auto ch = GetChannel(ep); + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = NULL; + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + + req_count_++; + + return true; +} + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& ret_msg) { + auto* outvar = var_h.scope->FindVar(var_h.name); + + std::istringstream iss(ret_msg.serialized()); + DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); +} + +bool RPCClient::AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(var_name); + + auto* var = scope.FindVar(var_name); + SerializeToMessage(var_name, var, ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep; + var_h.scope = &scope; + var_h.name = var_name; + var_h.ctx = &ctx; + + // stub context + auto ch = GetChannel(ep); + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + + req_count_++; + + return true; +} + +bool RPCClient::Wait() { + bool ok = true; + + while (true) { + if (req_count_ <= 0) { + break; + } + + if (!Proceed()) { + return false; + } + } + + return ok; +} + +bool RPCClient::Proceed() { + void* tag = NULL; + bool ok = false; + + // request counts. + if (!cq_.Next(&tag, &ok)) { + LOG(ERROR) << "Get meets CompletionQueue error"; + return false; + } + + GPR_ASSERT(ok); + PADDLE_ENFORCE(tag); + + // TODO(gongwb): add more retries. + ClientBase* c = static_cast(tag); + if (!c->status_.ok()) { + LOG(ERROR) << "proc param error:" << c->var_h_.String() + << " grpc error:" << c->status_.error_message(); + delete c; + return false; + } + + c->Process(); + delete c; + req_count_--; + return true; +} + +std::shared_ptr RPCClient::GetChannel(const std::string& ep) { + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + grpc::ChannelArguments args; + args.SetMaxSendMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + + auto ch = std::shared_ptr( + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args)); + + channels_[ep] = ch; + return ch; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/grpc_client.h b/paddle/operators/detail/grpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..a62e70a2533ae52d84d010504b19fed5aeb15dc0 --- /dev/null +++ b/paddle/operators/detail/grpc_client.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
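The client above drives every in-flight RPC through one completion queue: each call is tagged with a heap-allocated processor, and Proceed() casts the tag back, dispatches, and frees it. A gRPC-free sketch of that ownership pattern (stand-in types, editorial only):

    #include <iostream>
    #include <queue>

    // Stand-in for grpc::CompletionQueue: yields opaque tags as calls complete.
    struct FakeQueue {
      std::queue<void*> tags;
      bool Next(void** tag) {
        if (tags.empty()) return false;
        *tag = tags.front();
        tags.pop();
        return true;
      }
    };

    struct Processor {
      virtual ~Processor() = default;
      virtual void Process() = 0;
    };

    struct GetProcessor : Processor {
      void Process() override { std::cout << "handle one completed RPC\n"; }
    };

    int main() {
      FakeQueue cq;
      cq.tags.push(new GetProcessor);  // enqueued when the RPC was issued
      void* tag = nullptr;
      while (cq.Next(&tag)) {  // mirrors RPCClient::Proceed()
        auto* p = static_cast<Processor*>(tag);
        p->Process();
        delete p;  // the client owns each processor until its call completes
      }
      return 0;
    }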
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/operators/detail/sendrecvop_utils.h" +#include "paddle/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { +namespace detail { + +struct VarHandle { + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + std::string name; + + std::string String() const { + std::ostringstream s; + s << "name:[" << name << "] ep:[" << ep << "]"; + return s.str(); + } +}; + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& msg); + +class ClientBase { + public: + explicit ClientBase(std::shared_ptr ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + context_ = NULL; + } + + virtual ~ClientBase() {} + + virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + context_.reset(new grpc::ClientContext()); + var_h_ = var_info; + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Process() = 0; + + std::unique_ptr stub_; + std::unique_ptr context_; + grpc::Status status_; + VarHandle var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public ClientBase { + public: + explicit SendProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~SendProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VoidMessage reply_; + RequestSendCallBack response_call_back_ = NULL; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public ClientBase { + public: + explicit GetProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~GetProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VariableMessage reply_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; + +class RPCClient { + public: + bool AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + bool Wait(); + + private: + bool Proceed(); + std::shared_ptr GetChannel(const std::string& ep); + + private: + grpc::CompletionQueue cq_; + std::map> channels_; + int64_t req_count_ = 0; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/operators/detail/grpc_server.cc new file mode 100644 
index 0000000000000000000000000000000000000000..c0b94746a0b7f6ffb657bbf5af18360426933858 --- /dev/null +++ b/paddle/operators/detail/grpc_server.cc @@ -0,0 +1,245 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/detail/grpc_server.h" + +using grpc::ServerAsyncResponseWriter; + +namespace paddle { +namespace operators { +namespace detail { + +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq) + : service_(service), cq_(cq), status_(PROCESS) { + PADDLE_ENFORCE(cq_); + } + virtual ~RequestBase() {} + virtual void Process() { assert(false); } + + CallStatus Status() { return status_; } + void SetStatus(CallStatus status) { status_ = status; } + virtual std::string GetReqName() { assert(false); } + + protected: + grpc::ServerContext ctx_; + sendrecv::SendRecvService::AsyncService* service_; + grpc::ServerCompletionQueue* cq_; + CallStatus status_; +}; + +typedef std::pair MessageWithName; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, + SimpleBlockQueue* queue) + : RequestBase(service, cq), queue_(queue), responder_(&ctx_) { + service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, + this); + } + + virtual ~RequestSend() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + MessageWithName msg_with_name = + std::make_pair(request_.varname(), std::move(request_)); + queue_->Push(std::move(msg_with_name)); + responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + SimpleBlockQueue* queue_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, framework::Scope* scope, + const platform::DeviceContext* dev_ctx) + : RequestBase(service, cq), + responder_(&ctx_), + scope_(scope), + dev_ctx_(dev_ctx) { + service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); + } + + virtual ~RequestGet() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // proc request. + std::string var_name = request_.varname(); + auto* var = scope_->FindVar(var_name); + SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + // TODO(gongwb): check var's info. 
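+    // Finish() re-enqueues this request's tag on the completion queue;
+    // with status_ set to FINISH below, HandleRequest deletes it next round.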
+ responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VariableMessage reply_; + ServerAsyncResponseWriter responder_; + framework::Scope* scope_; + const platform::DeviceContext* dev_ctx_; +}; + +void AsyncGRPCServer::RunSyncUpdate() { + grpc::ServerBuilder builder; + builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); + builder.SetMaxSendMessageSize(std::numeric_limits::max()); + builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + builder.RegisterService(&service_); + + cq_send_ = builder.AddCompletionQueue(); + cq_get_ = builder.AddCompletionQueue(); + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << address_ << std::endl; + + std::function send_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); + std::function get_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + + t_send_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, false, + cq_send_.get(), "cq_send", send_register))); + + t_get_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, true, + cq_get_.get(), "cq_get", get_register))); + + // wait server + server_->Wait(); + t_send_->join(); + t_get_->join(); +} + +void AsyncGRPCServer::ShutdownQueue() { + std::unique_lock lock(cq_mutex_); + cq_send_->Shutdown(); + cq_get_->Shutdown(); + is_shut_down_ = true; +} + +// This URL explains why shutdown is complicate: +// https://stackoverflow.com/questions/35708348/grpc-what-is-the-recommended-way-to-shut-down-an-asynchronous-server-in-c +void AsyncGRPCServer::ShutDown() { + server_->Shutdown(); + ShutdownQueue(); +} + +void AsyncGRPCServer::TryToRegisterNewSendOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestSend* send = + new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); + VLOG(4) << "create RequestSend status:" << send->Status(); +} + +void AsyncGRPCServer::TryToRegisterNewGetOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_); + VLOG(4) << "create Requestget status:" << get->Status(); +} + +void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq, + std::string cq_name, + std::function TryToRegisterNewOne) { + TryToRegisterNewOne(); + + void* tag = NULL; + bool ok = false; + while (true) { + if (!cq->Next(&tag, &ok)) { + LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; + break; + } + + PADDLE_ENFORCE(tag); + if (wait && !done_) { + Wait(); + } + + RequestBase* base = (RequestBase*)tag; + // reference: + // https://github.com/tensorflow/tensorflow/issues/5596 + // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM + // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I + if (!ok) { + LOG(WARNING) << cq_name << " recv no regular event:argument name" + << base->GetReqName(); + TryToRegisterNewOne(); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + VLOG(4) << cq_name << " status:" << base->Status(); + TryToRegisterNewOne(); + base->Process(); + break; + } + case FINISH: { + VLOG(4) << cq_name << " status:" << base->Status(); + delete base; + break; + } + default: { assert(false); } + } + } +} + +void AsyncGRPCServer::Wait() { + std::unique_lock lock(this->mutex_); + condition_.wait(lock, [=] { return this->done_ == true; }); +} + +void AsyncGRPCServer::Reset() { + 
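+  // Clear done_ so that subsequent HandleRequest calls block in Wait()
+  // until Done() is invoked again.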
std::lock_guard lock(this->mutex_); + done_ = false; +} + +void AsyncGRPCServer::Done() { + { + std::lock_guard lock(this->mutex_); + done_ = true; + } + condition_.notify_all(); +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/grpc_server.h b/paddle/operators/detail/grpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..2c078b77771656dc7fc0342ecf21b8d33dc11817 --- /dev/null +++ b/paddle/operators/detail/grpc_server.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/var_type.h" +#include "paddle/operators/detail/simple_block_queue.h" + +#include "paddle/operators/detail/send_recv.grpc.pb.h" +#include "paddle/operators/detail/send_recv.pb.h" + +#include +#include +#include +#include "paddle/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +typedef std::pair MessageWithName; +class RequestBase; + +class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { + public: + explicit AsyncGRPCServer(const std::string &address) : address_(address) {} + + void RunSyncUpdate(); + + void Reset(); + + void Done(); + + void SetScope(framework::Scope *scope) { scope_ = scope; } + + void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + + const MessageWithName Get() { return this->var_recv_queue_.Pop(); } + + void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } + + void ShutDown(); + + protected: + void Wait(); + void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq, + std::string cq_name, + std::function TryToRegisterNewOne); + void TryToRegisterNewSendOne(); + void TryToRegisterNewGetOne(); + void ShutdownQueue(); + + private: + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + std::unique_ptr cq_send_; + std::unique_ptr cq_get_; + + sendrecv::SendRecvService::AsyncService service_; + std::unique_ptr server_; + + std::string address_; + framework::Scope *scope_; + const platform::DeviceContext *dev_ctx_; + // received variable from RPC, operators fetch variable from this queue. + SimpleBlockQueue var_recv_queue_; + + // condition of the sub program + std::mutex mutex_; + volatile mutable bool done_; + std::condition_variable condition_; + + std::unique_ptr t_send_; + std::unique_ptr t_get_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/operators/detail/safe_ref.h b/paddle/operators/detail/safe_ref.h index b71af17309f9f46b5c87f0f479d4e03443fa7f93..ff2a156f3d0e3c5c55354375ca81adf433a49686 100644 --- a/paddle/operators/detail/safe_ref.h +++ b/paddle/operators/detail/safe_ref.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto new file mode 100644 index 0000000000000000000000000000000000000000..8f962b4c69cc83dc2ab98b7dc27e18bc4b42bf18 --- /dev/null +++ b/paddle/operators/detail/send_recv.proto @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under +the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto3"; +package sendrecv; + +service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. + rpc GetVariable(VariableMessage) returns (VariableMessage) {} +} + +// VariableMessage is serialized paddle variable message. +// It can be: +// LoDTensor +// SelectedRows +enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; +} + +message VariableMessage { + string varname = 1; + // TODO(Yancey1989): reference framework::proto::VarDesc::VarType + VarType type = 2; + bytes serialized = 3; +} + +message VoidMessage {} diff --git a/paddle/operators/detail/sendrecvop_utils.cc b/paddle/operators/detail/sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..7635b9e8dbdff624bb42a9de346b8d05a980f9b6 --- /dev/null +++ b/paddle/operators/detail/sendrecvop_utils.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg) { + msg->set_varname(name); + std::ostringstream oss; + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarDesc_VarType_LOD_TENSOR: + msg->set_type(sendrecv::VarType::LOD_TENSOR); + framework::SerializeToStream(oss, var->Get(), ctx); + break; + case framework::proto::VarDesc_VarType_SELECTED_ROWS: + msg->set_type(sendrecv::VarType::SELECTED_ROWS); + framework::SerializeToStream(oss, var->Get(), + ctx); + break; + default: { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } + msg->set_serialized(oss.str()); +} + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + std::istringstream iss(msg.serialized()); + switch (msg.type()) { + case sendrecv::VarType::LOD_TENSOR: + DeserializeFromStream(iss, var->GetMutable(), ctx); + break; + case sendrecv::VarType::SELECTED_ROWS: { + DeserializeFromStream(iss, var->GetMutable(), + ctx); + break; + } + default: { + PADDLE_THROW("Deserialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/sendrecvop_utils.h b/paddle/operators/detail/sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..bc6581afab93c626c7c2439d699c6c2d858df9fa --- /dev/null +++ b/paddle/operators/detail/sendrecvop_utils.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/var_type.h" + +#include "paddle/operators/detail/send_recv.grpc.pb.h" +#include "paddle/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg); + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..c7f5ff4b5f494ce80b4ee792afb44841e9c8a2f2 --- /dev/null +++ b/paddle/operators/detail/simple_block_queue.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace detail { + +template +class SimpleBlockQueue { + private: + std::mutex mutex_; + std::condition_variable condition_; + std::deque queue_; + + public: + void Push(T const& value) { + { + std::unique_lock lock(this->mutex_); + queue_.push_front(value); + } + this->condition_.notify_one(); + } + + T Pop() { + std::unique_lock lock(this->mutex_); + this->condition_.wait(lock, [=] { return !this->queue_.empty(); }); + T rc(std::move(this->queue_.back())); + this->queue_.pop_back(); + return rc; + } +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h index 068c82f399316a1587d7322d8dab75823656800e..9ed524d4dcf7f8bd4607281ade34e9d56f409085 100644 --- a/paddle/operators/detail/strided_memcpy.h +++ b/paddle/operators/detail/strided_memcpy.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/ddim.h"
@@ -35,7 +35,7 @@ struct StridedMemcpyFunctor {
       memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
     } else {
 #ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::GPUPlace>(place);
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
       memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea44cd32678d7e8a5836c1886cf9c1b4961970aa
--- /dev/null
+++ b/paddle/operators/detection_output_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+namespace paddle {
+namespace operators {
+
+class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Loc",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The predicted locations. "
+             "The format of the input tensor is kNCHW, where K is the number "
+             "of priorbox points, "
+             "N is the number of boxes on each point, "
+             "C is 4, and H and W are both 1.");
+    AddInput("Conf",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The priorbox confidence. "
+             "The format of the input tensor is kNCHW, where K is the number "
+             "of priorbox points, "
+             "N is the number of boxes on each point, "
+             "C is the number of classes, and H and W are both 1.");
+    AddInput("PriorBox",
+             "(Tensor) The input tensor of detection_output operator. "
+             "It holds the positions and variances of the prior boxes.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of detection_output operator.");
+    AddAttr<int>("background_label_id", "(int), The background class index.");
+    AddAttr<int>("num_classes", "(int), The number of classes.");
+    AddAttr<float>("nms_threshold",
+                   "(float), The non-maximum suppression threshold.");
+    AddAttr<float>("confidence_threshold",
+                   "(float), The classification confidence threshold.");
+    AddAttr<int>("top_k", "(int), The number of boxes kept in the layer's output.");
+    AddAttr<int>("nms_top_k",
+                 "(int), The number of boxes kept in the NMS's output.");
+    AddComment(R"DOC(
+          Detection output for SSD (single shot multibox detector).
+          Apply NMS to the output of the network and compute the predicted
+          bounding box locations. The output's shape of this layer could
+          be zero if there is no valid bounding box.
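+          Each row of a non-empty output stores seven values describing
+          one kept detection.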
+ )DOC"); + } +}; + +class DetectionOutputOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Loc"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Conf"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of DetectionOutputOp should not be null."); + std::vector output_shape({1, 7}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, + ops::DetectionOutputOpMaker); +REGISTER_OP_CPU_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a6560e0492c559afd06e5152c34fab545b7ce61 --- /dev/null +++ b/paddle/operators/detection_output_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/detection_output_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..86285b748a7fe57437ddaa8cc4262d9ac1b0403a --- /dev/null +++ b/paddle/operators/detection_output_op.h @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/math/detection_util.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/softmax.h" +#include "paddle/operators/strided_memcpy.h" +namespace paddle { +namespace operators { +template +inline void transpose_fun(const framework::ExecutionContext& context, + const framework::Tensor& src, + framework::Tensor* dst) { + int input_nums = src.dims()[0]; + int offset = 0; + for (int j = 0; j < input_nums; ++j) { + framework::Tensor in_p_tensor = src.Slice(j, j + 1); + std::vector shape_vec( + {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3], + in_p_tensor.dims()[4], in_p_tensor.dims()[2]}); + framework::DDim shape(framework::make_ddim(shape_vec)); + framework::Tensor in_p_tensor_transpose; + in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); + std::vector shape_axis({0, 1, 3, 4, 2}); + math::Transpose trans5; + trans5(context.template device_context(), in_p_tensor, + &in_p_tensor_transpose, shape_axis); + auto dst_stride = framework::stride(dst->dims()); + auto src_stride = framework::stride(in_p_tensor_transpose.dims()); + StridedMemcpy(context.device_context(), in_p_tensor_transpose.data(), + src_stride, in_p_tensor_transpose.dims(), dst_stride, + dst->data() + offset); + offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; + } +} +template +class DetectionOutputKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_loc = context.Input("Loc"); + const framework::Tensor* in_conf = context.Input("Conf"); + const framework::Tensor* in_priorbox = + context.Input("PriorBox"); + auto* out = context.Output("Out"); + int num_classes = context.template Attr("num_classes"); + int top_k = context.template Attr("top_k"); + int nms_top_k = context.template Attr("nms_top_k"); + int background_label_id = context.template Attr("background_label_id"); + float nms_threshold = context.template Attr("nms_threshold"); + float confidence_threshold = + context.template Attr("confidence_threshold"); + size_t batch_size = in_conf->dims()[1]; + int conf_sum_size = in_conf->numel(); + // for softmax + std::vector conf_shape_softmax_vec( + {conf_sum_size / num_classes, num_classes}); + framework::DDim conf_shape_softmax( + framework::make_ddim(conf_shape_softmax_vec)); + // for knchw => nhwc + std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], + in_loc->dims()[4], + in_loc->dims()[2] * in_loc->dims()[0]}); + std::vector conf_shape_vec( + {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4], + in_conf->dims()[2] * in_conf->dims()[0]}); + framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); + framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); + framework::Tensor loc_tensor; + framework::Tensor conf_tensor; + loc_tensor.mutable_data(loc_shape, context.GetPlace()); + conf_tensor.mutable_data(conf_shape, context.GetPlace()); + // for cpu + framework::Tensor loc_cpu; + framework::Tensor conf_cpu; + framework::Tensor priorbox_cpu; + const T* priorbox_data = in_priorbox->data(); + transpose_fun(context, *in_loc, &loc_tensor); + transpose_fun(context, *in_conf, &conf_tensor); + conf_tensor.Resize(conf_shape_softmax); + math::SoftmaxFunctor()( + context.template device_context(), &conf_tensor, + &conf_tensor); + T* loc_data = loc_tensor.data(); + T* conf_data = conf_tensor.data(); + if 
(platform::is_gpu_place(context.GetPlace())) { + loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); + framework::Copy(loc_tensor, platform::CPUPlace(), + context.device_context(), &loc_cpu); + loc_data = loc_cpu.data(); + conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); + framework::Copy(conf_tensor, platform::CPUPlace(), + context.device_context(), &conf_cpu); + conf_data = conf_cpu.data(); + priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); + framework::Copy(*in_priorbox, platform::CPUPlace(), + context.device_context(), &priorbox_cpu); + priorbox_data = priorbox_cpu.data(); + } + // get decode bboxes + size_t num_priors = in_priorbox->numel() / 8; + std::vector>> all_decoded_bboxes; + for (size_t n = 0; n < batch_size; ++n) { + std::vector> decoded_bboxes; + for (size_t i = 0; i < num_priors; ++i) { + size_t prior_offset = i * 8; + size_t loc_pred_offset = n * num_priors * 4 + i * 4; + std::vector> prior_bbox_vec; + math::GetBBoxFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_vec); + std::vector> prior_bbox_var; + math::GetBBoxVarFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_var); + std::vector loc_pred_data; + for (size_t j = 0; j < 4; ++j) + loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); + math::BBox bbox = math::DecodeBBoxWithVar( + prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); + decoded_bboxes.push_back(bbox); + } + all_decoded_bboxes.push_back(decoded_bboxes); + } + std::vector>> all_indices; + int num_kept = math::GetDetectionIndices( + conf_data, num_priors, num_classes, background_label_id, batch_size, + confidence_threshold, nms_top_k, nms_threshold, top_k, + all_decoded_bboxes, &all_indices); + + if (num_kept <= 0) { + std::vector out_shape_vec({0, 0}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->Resize(out_shape); + return; + } + std::vector out_shape_vec({num_kept, 7}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->mutable_data(out_shape, context.GetPlace()); + framework::Tensor out_cpu; + T* out_data = out->data(); + if (platform::is_gpu_place(context.GetPlace())) { + out_cpu.mutable_data(out->dims(), platform::CPUPlace()); + out_data = out_cpu.data(); + } + math::GetDetectionOutput(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); + if (platform::is_gpu_place(context.GetPlace())) { + framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(), + out); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 818146aca766cb13b93fd024c11c1209655d9e11..35cb18797ff66cb87a6658e73ce02b0bfae29baa 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/dropout_op.h" @@ -25,12 +25,10 @@ class DropoutOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("dropout_prob"), 0); - PADDLE_ENFORCE_LE(ctx->Attrs().Get("dropout_prob"), 1); auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_training") == true) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("Mask", x_dims); } ctx->ShareLoD("X", /*->*/ "Out"); @@ -40,16 +38,19 @@ class DropoutOp : public framework::OperatorWithKernel { template class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { public: - DropoutOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of dropout op."); AddOutput("Out", "The output of dropout op."); AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); AddAttr("dropout_prob", "Probability of setting units to zero.") - .SetDefault(.5f); - AddAttr("is_training", "True if in training phase.").SetDefault(true); + .SetDefault(.5f) + .AddCustomChecker([](const float& drop_p) { + PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, + "'dropout_prob' must be between 0.0 and 1.0."); + }); + AddAttr("is_test", "True if in test phase.").SetDefault(false); AddAttr("seed", "Dropout random seed.").SetDefault(0); AddComment(R"DOC( @@ -71,16 +72,14 @@ class DropoutOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_training"), true, - "GradOp is only callable when is_training is true"); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + "GradOp is only callable when is_test is false"); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) must not be null."); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("dropout_prob"), 0); - PADDLE_ENFORCE_LE(ctx->Attrs().Get("dropout_prob"), 1); auto x_dims = ctx->GetInputDim("X"); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(x_dims, out_dims, @@ -100,6 +99,8 @@ namespace ops = paddle::operators; REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( - dropout_grad, 
ops::DropoutGradKernel); + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index 30c769000f2b98c69eaa78a4c139630dd0956386..c56930336e865079f1b96df0f35b0a051fe63a27 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include @@ -30,16 +30,15 @@ struct MaskGenerator { __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) : dropout_prob(dropout_prob), seed(seed) {} - __host__ __device__ T operator()(const unsigned int n) const { + inline __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); thrust::uniform_real_distribution dist(0, 1); rng.discard(n); if (dist(rng) < dropout_prob) { return static_cast(0); - } else { - return static_cast(1); } + return static_cast(1); } }; @@ -58,8 +57,8 @@ class GPUDropoutKernel : public framework::OpKernel { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); - if (context.Attr("is_training")) { + auto& place = *context.template device_context().eigen_device(); + if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); int size = framework::product(mask->dims()); @@ -71,7 +70,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto M = EigenMatrix::Reshape(*mask, 1); Y.device(place) = X * M; } else { - Y.device(place) = X * dropout_prob; + Y.device(place) = X * (1.0f - dropout_prob); } } }; @@ -80,7 +79,9 @@ class GPUDropoutKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - dropout, ops::GPUDropoutKernel); -REGISTER_OP_GPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index 6000b75fecdff74844605215e9364ac8f8a1525a..c90b8d277eb78048c001d36a367287146b51c636 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include @@ -25,7 +25,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CPUDropoutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel { auto* y_data = y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); - if (context.Attr("is_training")) { + if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); int seed = context.Attr("seed"); @@ -55,18 +55,19 @@ class CPUDropoutKernel : public framework::OpKernel { } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); - Y.device(place) = X * dropout_prob; + auto& place = + *context.template device_context().eigen_device(); + Y.device(place) = X * (1.0f - dropout_prob); } } }; -template +template class DropoutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(context.Attr("is_training"), - "GradOp is only callable when is_training is true"); + PADDLE_ENFORCE(!context.Attr("is_test"), + "GradOp is only callable when is_test is false"); auto* grad_x = context.Output(framework::GradVarName("X")); auto* grad_y = context.Input(framework::GradVarName("Out")); @@ -77,7 +78,8 @@ class DropoutGradKernel : public framework::OpKernel { auto dX = EigenMatrix::Reshape(*grad_x, 1); auto dY = EigenMatrix::Reshape(*grad_y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); dX.device(place) = dY * M; } }; diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc deleted file mode 100644 index d48cc4e8df587708ab93e7d788145adc01c1d3e5..0000000000000000000000000000000000000000 --- a/paddle/operators/dynamic_recurrent_op.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve . - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/dynamic_recurrent_op.h" - -#include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using framework::Scope; -using framework::TensorArray; -using framework::LoDTensor; -using framework::Variable; -using framework::OperatorBase; -using framework::DySeqMetaBatch; - -namespace detail { - -inline void CreateVariables(Scope& scope, - const std::vector& var_names) { - for (const auto& name : var_names) { - scope.Var(name); - } -} - -/* - * The inputs with sequence should be reordered when they are split, so the - * boot_states should be reordered in the same order. - * - * NOTE This may require that the `pre_state` of the first time step should just - * copy the `boot_state` rather than reference it, for that the content should - * be reordered, but the RNN op should not change the `boot_state` as an input - * variable's content. - */ -inline void ReorderInitialState(const DySeqMetaBatch& metas, - const LoDTensor& boot_state, LoDTensor* tensor, - const platform::Place& dst_place) { - for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor->Slice(seq_id, seq_id + 1); - auto boot_slice = - boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); - // TODO(superjom) pass in device context as an argument - slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext()); - } -} - -inline void RestoreInitialState(const DySeqMetaBatch& metas, - const LoDTensor& tensor, LoDTensor* boot_state, - const platform::Place& dst_place) { - for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor.Slice(seq_id, seq_id + 1); - auto boot_slice = - boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); - boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext()); - } -} - -} // namespace detail - -// Implementation for forward propagation. -template <> -void RNNAlgorithm::Run( - const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx) { - SetComputeMode(ComputeMode::kForward); - cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); - SplitInputs(); - CreateScopes(); - WriteStepInputs(); - InitStates(); - WriteStepOutputs(); - RunSteps(); - ConcatOutputs(); -} - -// Implementation for backward propagation. -template <> -void RNNAlgorithm::Run( - const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx) { - SetComputeMode(ComputeMode::kBackward); - cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); - SplitInputs(); - WriteStepInputs(); - InitStates(); - WriteStepOutputs(); - RunSteps(); - // copy boot-states' gradients back. 
- for (const auto& state : arg_.states) { - ExportInitialStateGradient(state); - } - - ConcatOutputs(); -} - -void RNNAlgorithm::SplitInputs() { - // TODO(superjom) make level a config - // TODO(superjom) check all the inputs has the same LoD - int level = 0; - for (const auto& item : cache_.inputs) { - const auto& var = item.second; - const auto& tensor = var->Get(); - TensorArray& ta = step_inputs_[item.first]; - - dy_seq_metas_[item.first] = - ta.Unpack(tensor, level, true /*length_descend*/); - - if (cache_.num_steps) { - PADDLE_ENFORCE_EQ(ta.size(), cache_.num_steps, - "inputs should have the same steps"); - } else { - cache_.num_steps = ta.size(); - } - } -} - -void RNNAlgorithm::WriteStepInputs() { - for (const auto& item : cache_.inputs) { - auto ta_it = step_inputs_.find(item.first); - PADDLE_ENFORCE(ta_it != step_inputs_.end(), - "step_inputs_ not compatible with memory set"); - TensorArray& ta = ta_it->second; - for (size_t step = 0; step < ta.size(); step++) { - auto tensor = ta.Read(step); - auto& step_scope = cache_.GetScope(step); - Variable* var = step_scope.FindVar(item.first); - if (var == nullptr) { - var = step_scope.Var(item.first); - } - var->GetMutable()->ShareDataWith(tensor); - } - } -} - -void RNNAlgorithm::WriteStepOutputs() { - // initialize step outputs - for (const auto& item : cache_.outputs) { - step_outputs_.emplace(item.first, TensorArray()); - } - PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL); -} - -void RNNAlgorithm::CreateScopes() { - PADDLE_ENFORCE_GT(cache_.num_steps, 0); - // resize scopes - size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size(); - for (size_t i = 0; i < num_scopes_need_create; i++) { - cache_.scopes->emplace_back(&cache_.scope->NewScope()); - } - - // init temporary inputs - PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first"); - std::vector states; - std::vector ex_states; - std::vector step_unit_outputs; - std::transform(arg_.states.begin(), arg_.states.end(), - std::back_inserter(states), - [](const rnn::StateAttr& m) { return m.var; }); - std::transform(arg_.states.begin(), arg_.states.end(), - std::back_inserter(ex_states), - [](const rnn::StateAttr& m) { return m.pre_var; }); - for (const auto& item : step_unit_->Outputs()) { - for (const auto& var : item.second) { - step_unit_outputs.push_back(var); - } - } - - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& scope = cache_.GetScope(step); - detail::CreateVariables(scope, arg_.inlinks); - detail::CreateVariables(scope, arg_.outlinks); - detail::CreateVariables(scope, states); - detail::CreateVariables(scope, ex_states); - detail::CreateVariables(scope, step_unit_outputs); - } -} - -void RNNAlgorithm::ConcatOutputs() { - // TODO(superjom) transform this to a config - int level = 0; - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& scope = cache_.GetScope(step); - for (auto& item : step_outputs_) { - auto* var = scope.FindVar(item.first); - PADDLE_ENFORCE_NOT_NULL(var); - auto* tensor = var->GetMutable(); - tensor->mutable_data(platform::CPUPlace()); - item.second.WriteShared(step, *tensor); - } - } - // the inputs' lods should be the same, so randomly get one lod. 
- const auto& some_lod = - cache_.scope->FindVar(arg_.inlinks.front())->Get().lod(); - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - for (auto& item : step_outputs_) { - auto tensor = item.second.Pack(level, some_meta, some_lod); - auto* output = cache_.outputs[item.first]->GetMutable(); - const_cast(output)->ShareDataWith(tensor); - } -} - -void RNNAlgorithm::RunSteps() { - if (IsBackward()) { - // call stepnet in all the time steps reversely - for (int step = cache_.num_steps - 1; step >= 0; step--) { - auto& step_scope = cache_.GetScope(step); - step_unit_->Run(step_scope, *cache_.dev_ctx); - } - } else { - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& step_scope = cache_.GetScope(step); - step_unit_->Run(step_scope, *cache_.dev_ctx); - } - } -} - -void RNNAlgorithm::InitStates() { - for (size_t step = 0; step < cache_.num_steps; step++) { - for (const auto& state : arg_.states) { - CreateState(state, step); - LinkState(state, step); - } - } -} - -void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) { - auto& scope = cache_.GetScope(step); - auto& state = *cache_.GetTensor(scope, state_attr.var); - auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var); - - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; - auto dims = boot_state.dims(); - dims[0] = num_instances; - - state.Resize(dims); - state.mutable_data(platform::CPUPlace()); - states_[state_attr.var].WriteShared(step, state); -} - -void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) { - auto& scope = cache_.GetScope(step); - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - - // process the first state's boot-state(the 0-step in forward mode or the - // last step in backward mode) - // Only forward mode need to link the boot-state to the `pre-state` in first - // time step. In backward mode, need to copy the gradient of `pre-state` in - // first time step to the gradient of `boot-state`. - if (step == 0 && IsForward()) { - LinkInitialState(state); - } else { - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; - auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var); - // shink and share from previous state - auto shrinked_pre_state = pre_state->Slice(0, num_instances); - state_pre.ShareDataWith(shrinked_pre_state); - } -} - -void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) { - // all the step_inputs' metas should be the same, just randomly select one - // and get the dyseq meta. - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - auto& scope = cache_.GetScope(0); - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var); - pre_state->mutable_data(platform::CPUPlace()); - // allocate state - state_pre.Resize(pre_state->dims()); - state_pre.mutable_data(platform::CPUPlace()); - detail::ReorderInitialState(some_meta, *pre_state, &state_pre, - pre_state->place()); -} - -void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) { - // all the step_inputs' metas should be the same, just randomly select one - // and get the dyseq meta. 
- const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - auto& scope = cache_.GetScope(0); - - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var); - pre_state.Resize(state_pre.dims()); - detail::RestoreInitialState(some_meta, state_pre, &pre_state, - pre_state.place()); -} - -void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name, - const paddle::framework::OperatorBase& op, - const paddle::framework::Scope& scope, - platform::DeviceContext const* dev_ctx, - rnn::Argument* arg) { - this->scope = &scope; - InitArgument(name, op, arg); - CacheScopes(scope, *arg); - CacheInlinks(scope, arg->inlinks); - CacheOutlinks(scope, arg->outlinks); - this->dev_ctx = dev_ctx; -} - -void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name, - const OperatorBase& op, - rnn::Argument* arg) { - rnn::InitArgument(name, arg, op, false /*is_grad*/); -} - -void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope, - const rnn::Argument& arg) { - auto scopes_var = scope.FindVar(arg.step_scopes); - PADDLE_ENFORCE(scopes_var != nullptr, - "the step_scopes output argument [%s] should be created first " - "by framework.", - arg.step_scopes); - this->scopes = scopes_var->GetMutable>(); -} - -void RNNAlgorithm::ArgCache::CacheInlinks( - const Scope& scope, const std::vector& names) { - for (auto name : names) { - auto* var = GetVariable(scope, name); - inputs[name] = var; - } -} - -void RNNAlgorithm::ArgCache::CacheOutlinks( - const Scope& scope, const std::vector& names) { - for (auto name : names) { - auto* var = GetVariable(scope, name); - outputs[name] = var; - } -} - -Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope, - const std::string& name) { - auto* var = scope.FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name); - return var; -} - -LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope, - const std::string& name) { - auto* var = GetVariable(scope, name); - return var->GetMutable(); -} - -const std::array RNNAlgorithm::kArgNames{ - {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs", - "states", "ex_states", "initial_states"}, - rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD", - "inputs@GRAD", "states", "ex_states", - "initial_states@GRAD"}}}; - -void DynamicRecurrentOp::Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const { - rnn.Run( - scope, *dynamic_cast(this), dev_ctx); -} - -void DynamicRecurrentGradientOp::Run( - const Scope& scope, const platform::DeviceContext& dev_ctx) const { - rnn.Run( - scope, *dynamic_cast(this), dev_ctx); -} - -class DynamicRecurrentOpProtoAndCheckerMaker - : public framework::OpProtoAndCheckerMaker { - public: - DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = - RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward]; - // inputs and outputs stored in proto - AddInput(name.inlinks, - "The inputs that need to be segmented for each step.") - .AsDuplicable(); - AddInput(name.initial_states, "Variables to initialize the states.") - .AsDuplicable(); - - AddOutput(name.outlinks, - "The outputs that need to be concatenated for all steps.") - .AsDuplicable(); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.ex_states, "names of ex_states"); - 
AddAttr>(name.states, "names of states"); - - AddComment(R"DOC( -Dynamic Recurrent Operator. - -This is a RNN operator for varience-length sequences. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp, - paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker, - dynamic_recurrent_grad, - paddle::operators::DynamicRecurrentGradientOp); diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h deleted file mode 100644 index 5b0548c3a44c9f58838ecc567ee41a587883c26a..0000000000000000000000000000000000000000 --- a/paddle/operators/dynamic_recurrent_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_TESTING -#include "gtest/gtest.h" -#endif - -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/operator.h" -#include "paddle/framework/tensor_array.h" -#include "paddle/framework/variable.h" -#include "paddle/operators/rnn/recurrent_op_utils.h" - -namespace paddle { -namespace operators { - -class RNNAlgorithm { - public: - enum ComputeMode { kForward = 0, kBackward = 1 }; - static const std::array kArgNames; - using value_type = float; - - /* - * Different `Run` method for forward and backward, `_` is just for template - * specifialization. - */ - template - void Run(const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx); - /* - * Split the inputs(LoDTensors) to segments for each time step. - */ - void SplitInputs(); - - /* - * Create step-scopes to store temporary outputs in each time steps. - */ - void CreateScopes(); - - /* - * Link TensorArray steps to the corresponding variables located in - * step-scopes. - */ - void WriteStepInputs(); - - /* - * Write output of each step to the corresponding TensorArray. - */ - void WriteStepOutputs(); - - /* - * Initialize the states, each state will have a corresponding pre-state, - * which share the memory with the state in the previous time state. The - * pre-state in the first time step will be initialized with an zero tensor or - * a tensor in parent scope if is provided. - */ - void InitStates(); - - /* - * Create state variables for each time step. - */ - void CreateState(const rnn::StateAttr& state, size_t step); - - /* - * Link pre-state variable in current scope to the state variable in the - * previous time step (scope) by reference. - */ - void LinkState(const rnn::StateAttr& state, size_t step); - - /* - * Link the pre-state of the first time step to the `boot-state` in parent's - * scope. - */ - void LinkInitialState(const rnn::StateAttr& state); - - /* - * Copy the gradient from `pre-state` in the first step-scope to the - * `boot-state` in parent's scope. - */ - void ExportInitialStateGradient(const rnn::StateAttr& state); - - /* - * Calculate time steps. 
- */ - void RunSteps(); - - /* - * Concatenate outputs in each time step and generate a LoDTensor. - */ - void ConcatOutputs(); - - void SetComputeMode(ComputeMode mode) { mode_ = mode; } - bool IsForward() const { return mode_ == ComputeMode::kForward; } - bool IsBackward() const { return mode_ == ComputeMode::kBackward; } - - /* - * set a step unit that is created according to a RecurrentOp's step unit. - */ - void SetStepUnit(std::unique_ptr step_unit) { - PADDLE_ENFORCE_NOT_NULL(step_unit); - step_unit_ = std::move(step_unit); - } - const framework::OperatorBase& GetStepUnit() const { return *step_unit_; } - - const framework::TensorArray& state(const std::string& name) const { - auto it = states_.find(name); - PADDLE_ENFORCE(it != states_.end()); - return it->second; - } - const framework::TensorArray& step_input(const std::string& name) const { - auto it = step_inputs_.find(name); - PADDLE_ENFORCE(it != step_inputs_.end()); - return it->second; - } - const framework::TensorArray& step_output(const std::string& name) const { - auto it = step_outputs_.find(name); - PADDLE_ENFORCE(it != step_outputs_.end()); - return it->second; - } - - protected: - struct ArgCache { - framework::Scope const* scope; - std::vector* scopes; - std::map inputs; - std::map outputs; - platform::DeviceContext const* dev_ctx; - - size_t num_steps{0}; - - void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op, - const framework::Scope& scope, - platform::DeviceContext const* dev_ctx, rnn::Argument* arg); - - framework::Scope& GetScope(size_t index) { - PADDLE_ENFORCE_LT(index, num_steps); - return *scopes->at(index); - } - - framework::LoDTensor* GetTensor(const framework::Scope& scope, - const std::string& name); - - private: - void InitArgument(const rnn::ArgumentName& name, - const framework::OperatorBase& op, rnn::Argument* arg); - void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg); - void CacheInlinks(const framework::Scope& scope, - const std::vector& names); - void CacheOutlinks(const framework::Scope& scope, - const std::vector& names); - framework::Variable* GetVariable(const framework::Scope& scope, - const std::string& name); - }; - - private: - std::unique_ptr step_unit_; - std::map states_; - std::map step_inputs_; - std::map step_outputs_; - std::map> dy_seq_metas_; - rnn::Argument arg_; - ArgCache cache_; - ComputeMode mode_{ComputeMode::kForward}; - -#ifdef PADDLE_WITH_TESTING - // test forward - friend class RNNAlgorithmTestHelper; - FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs); - FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache); - FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes); - FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs); - FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs); - FRIEND_TEST(RNNAlgorithmTestHelper, InitStates); - FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs); -// TODO(superjom) test backward -#endif -}; - -class DynamicRecurrentOp : public framework::OperatorBase { - public: - DynamicRecurrentOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentOp(const DynamicRecurrentOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - - mutable RNNAlgorithm rnn; -}; - -class DynamicRecurrentGradientOp : public 
framework::OperatorBase { - public: - DynamicRecurrentGradientOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - - mutable RNNAlgorithm rnn; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc deleted file mode 100644 index 8d840e259b190ead86a66df8ab31c5170db4d824..0000000000000000000000000000000000000000 --- a/paddle/operators/dynamic_recurrent_op_test.cc +++ /dev/null @@ -1,217 +0,0 @@ -#include "paddle/operators/dynamic_recurrent_op.h" - -#include - -#include "paddle/framework/ddim.h" -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/op_desc.h" -#include "paddle/framework/op_registry.h" -#include "paddle/operators/net_op.h" - -namespace paddle { -namespace operators { - -using framework::Scope; -using framework::TensorArray; -using framework::LoDTensor; -using framework::Variable; - -class TestOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - DEFINE_OP_CLONE_METHOD(TestOp); - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} -}; - -void OpDescNewVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - var->add_arguments(arg_name); - } -} - -// create a LoD tensor in scope with specific dims -LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims, - const platform::Place& place) { - auto* var = scope.Var(name); - auto* tensor = var->GetMutable(); - tensor->Resize(dims); - tensor->mutable_data(place); - return tensor; -} - -class RNNAlgorithmTestHelper : public ::testing::Test { - protected: - const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0]; - - virtual void SetUp() override { - CreateGlobalVariables(); - - auto op_desc = CreateOpDesc(); - op = paddle::framework::OpRegistry::CreateOp(op_desc); - dop = &(dynamic_cast(op.get())->rnn); - InitCacheManually(); - InitStepNet(); - } - - framework::OpDesc CreateOpDesc() { - // create op - paddle::framework::OpDesc op_desc; - op_desc.set_type("dynamic_recurrent"); - - OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs()); - OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs()); - OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs()); - OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs()); - - // set pre-states - auto pre_memories = op_desc.mutable_attrs()->Add(); - pre_memories->set_name(argname.ex_states); - pre_memories->set_type(paddle::framework::AttrType::STRINGS); - auto pre_memories_item = pre_memories->add_strings(); - *pre_memories_item = "mem@pre"; - - // set states - auto memories = op_desc.mutable_attrs()->Add(); - memories->set_name(argname.states); - memories->set_type(paddle::framework::AttrType::STRINGS); - auto memories_item = memories->add_strings(); - *memories_item = "mem"; - return op_desc; - } - - void CreateGlobalVariables() { - platform::CPUPlace place; - scope.Var("step_scopes"); - 
CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place); - CreateVar(scope, "out0", framework::make_ddim({10, 20}), place); - auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place); - // 10 instanes with 4 sentences, length is 4, 3, 2, 1 respectively. - framework::LoD in0_lod(1); - for (int x : std::vector{0, 4, 7, 9, 10}) { - in0_lod[0].push_back(x); - } - in0->set_lod(in0_lod); - in0->Resize(framework::make_ddim({10, 8})); - // set the content, each sentence content is seqid.batchid - // the seqid starts from 0 - int start = 0; - for (size_t seqid = 0; seqid < in0_lod.size() - 1; seqid++) { - for (size_t batchid = 0; - batchid < in0_lod[0][seqid + 1] - in0_lod[0][seqid]; batchid++) { - float v = seqid + batchid * 0.1; - - for (size_t dim = 0; dim < 8; dim++) { - in0->data()[start * 8 + dim] = v; - } - start++; - } - } - } - - void InitCacheManually() { - dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context, - &dop->arg_); - } - - void InitStepNet() { - std::unique_ptr stepnet{new NetOp}; - dynamic_cast(stepnet.get()) - ->AppendOp(std::unique_ptr(new TestOp( - "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}}, - {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); - dop->SetStepUnit(std::move(stepnet)); - } - - protected: - RNNAlgorithm* dop; - std::unique_ptr op; - paddle::platform::CPUDeviceContext device_context; - paddle::framework::Scope scope; -}; - -TEST_F(RNNAlgorithmTestHelper, CreateCache) { - const rnn::Argument& arg = dop->arg_; - ASSERT_EQ(arg.inlinks.size(), 1UL); - ASSERT_EQ(arg.outlinks.size(), 1UL); -} - -TEST_F(RNNAlgorithmTestHelper, SplitInputs) { - dop->SplitInputs(); - auto& in0_ta = dop->step_inputs_["in0"]; - ASSERT_EQ(in0_ta.size(), 4UL); - - const auto& batch0 = in0_ta.Read(0); - const auto& batch1 = in0_ta.Read(1); - const auto& batch2 = in0_ta.Read(2); - const auto& batch3 = in0_ta.Read(3); - EXPECT_EQ(batch0.dims()[0], 4); - EXPECT_EQ(batch1.dims()[0], 3); - EXPECT_EQ(batch2.dims()[0], 2); - EXPECT_EQ(batch3.dims()[0], 1); -} - -TEST_F(RNNAlgorithmTestHelper, CreateScopes) { - dop->SplitInputs(); - dop->CreateScopes(); - ASSERT_EQ(dop->cache_.num_steps, 4UL); - ASSERT_EQ(dop->cache_.scopes->size(), 4UL); -} - -TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) { - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - for (auto name : std::vector({"in0"})) { - ASSERT_TRUE(scope.FindVar(name) != nullptr); - } - } -} - -TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) { - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - dop->WriteStepOutputs(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - for (auto name : std::vector({"out0"})) { - ASSERT_TRUE(scope.FindVar(name)); - } - } -} - -TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) { - // Let's leave this test to python unittest. 
-} - -TEST_F(RNNAlgorithmTestHelper, InitStates) { - dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward); - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - dop->WriteStepOutputs(); - dop->InitStates(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - auto state = scope.FindVar("mem"); - ASSERT_TRUE(state != nullptr); - - auto* pre_state = scope.FindVar("mem@pre"); - ASSERT_TRUE(pre_state != nullptr); - - auto* boot_state = scope.FindVar("boot_mem"); - ASSERT_TRUE(boot_state != nullptr); - } -} - -} // operators -} // namespace paddle diff --git a/paddle/operators/edit_distance_op.cc b/paddle/operators/edit_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e383f07fa9b53a3def10f6405a0d36f48f52ff08 --- /dev/null +++ b/paddle/operators/edit_distance_op.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/edit_distance_op.h" + +namespace paddle { +namespace operators { + +class EditDistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto hyp_dims = ctx->GetInputDim("Hyps"); + auto ref_dims = ctx->GetInputDim("Refs"); + PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1, + "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1, + "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + ctx->SetOutputDim("Out", ctx->GetInputDim("Refs")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + ctx.device_context()); + } +}; + +class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Hyps", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for hypothesis strings."); + AddInput("Refs", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for reference strings."); + AddAttr("normalized", + "(bool, default false) Indicates whether to normalize " + "the edit distance by the length of reference string.") + .SetDefault(false); + AddOutput("Out", + "(2-D Tensor with shape [`batch_size` x 1]) " + "The output edit distances of EditDistance operator."); + AddComment(R"DOC( + +EditDistance operator computes the edit distances between a batch of hypothesis +strings and their references.
+ +Edit distance, also called Levenshtein distance, measures how dissimilar two strings +are by counting the minimum number of operations to transform one string into another. +Here the operations include insertion, deletion, and substitution. For example, +given hypothesis string A = "kitten" and reference B = "sitting", the edit distance +is 3, since A can be transformed into B by two substitutions and one +insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + +Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total +number denoted by `batch_size`, and the separation between strings is specified by the LoD information. +The `batch_size` reference strings are arranged in the same order in the +LoDTensor Input(Refs). + +Output(Out) contains the `batch_size` results, each of which stands for the edit +distance of one pair of strings. If Attr(normalized) is true, the edit distance +will be divided by the length of the reference string. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + edit_distance, ops::EditDistanceKernel); diff --git a/paddle/operators/edit_distance_op.cu b/paddle/operators/edit_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf5ebc5c38fd006d10de790e45e9bff3409bd20c --- /dev/null +++ b/paddle/operators/edit_distance_op.cu @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/platform/cuda_helper.h" +#include "paddle/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillFirstRow(T* dist, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < N + 1) { + dist[idx] = idx; + } +} + +template +__global__ void FillFirstColumn(T* dist, const int M, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < M + 1) { + dist[idx * (N + 1)] = idx; + } +} + +template +__global__ void Levenshtein(T* dist, const int* x1, const int* x2, const int M, + const int N, const int start) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int offset = N; + int index = start + idx * offset; + int row = index / (N + 1); + int col = index % (N + 1); + if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { + int cost = x1[row - 1] == x2[col - 1] ? 0 : 1; + int dels = dist[(row - 1) * (N + 1) + col] + 1; + int ins = dist[row * (N + 1) + col - 1] + 1; + int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost; + dist[index] = min(dels, min(ins, subs)); + } +} + +template +__global__ void SetOutput(T* out, const T* dist, const int M, const int N, + bool normalized) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx == 0) { + out[0] = normalized ?
dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N]; + } +} + +template +class EditDistanceGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + + auto normalized = ctx.Attr("normalized"); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + + auto num_strs = hyp_lod.size() - 1; + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; num++) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + if (m == 0 || n == 0) { + distance = std::max(m, n); + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + memory::Copy(boost::get(ctx.GetPlace()), out + num, + platform::CPUPlace(), &distance, sizeof(T), stream); + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + + FillFirstColumn<<<1 + m / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n); + + FillFirstRow<<<1 + n / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n); + // Compute the elements of the distance matrix in the anti-diagonal direction + for (int64_t slice = 2; slice < m + n + 1; ++slice) { + int z_m = slice < m + 1 ? 0 : slice - m; + int z_n = slice < n + 1 ? 0 : slice - n; + int size = slice - (z_m + z_n) + 1; // number of elements in the same + // anti-diagonal line to update + // the start index from which to compute + int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2, + m, n, start); + } + SetOutput<<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + edit_distance, + ops::EditDistanceGPUKernel); diff --git a/paddle/operators/edit_distance_op.h b/paddle/operators/edit_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..537e70281a5a750db480468a8f8e3c0465de6c5a --- /dev/null +++ b/paddle/operators/edit_distance_op.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
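// The wavefront scheme in the CUDA kernels above is easier to see in a
// host-side form. The sketch below is a minimal, standalone CPU analogue
// (illustration only, not part of the operator; the helper name
// LevenshteinWavefront is hypothetical): cells on one anti-diagonal
// (row + col == slice) of the (m + 1) x (n + 1) distance matrix depend only
// on the two previous anti-diagonals, which is why the Levenshtein kernel can
// update a whole diagonal in parallel per launch.

#include <algorithm>
#include <vector>

int LevenshteinWavefront(const std::vector<int>& x1,
                         const std::vector<int>& x2) {
  const int m = static_cast<int>(x1.size());
  const int n = static_cast<int>(x2.size());
  std::vector<int> dist((m + 1) * (n + 1), 0);
  for (int i = 0; i <= m; ++i) dist[i * (n + 1)] = i;  // cf. FillFirstColumn
  for (int j = 0; j <= n; ++j) dist[j] = j;            // cf. FillFirstRow
  for (int slice = 2; slice < m + n + 1; ++slice) {
    // Every cell visited in this inner loop could be computed concurrently.
    for (int row = std::max(1, slice - n); row <= std::min(m, slice - 1);
         ++row) {
      const int col = slice - row;
      const int cost = x1[row - 1] == x2[col - 1] ? 0 : 1;
      const int dels = dist[(row - 1) * (n + 1) + col] + 1;
      const int ins = dist[row * (n + 1) + col - 1] + 1;
      const int subs = dist[(row - 1) * (n + 1) + (col - 1)] + cost;
      dist[row * (n + 1) + col] = std::min(dels, std::min(ins, subs));
    }
  }
  return dist[m * (n + 1) + n];
}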
*/ + +#pragma once +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class EditDistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + + auto normalized = ctx.Attr("normalized"); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + auto num_strs = hyp_lod.size() - 1; + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; ++num) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + + if (m == 0) { + distance = n; + } else if (n == 0) { + distance = m; + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + for (int64_t i = 0; i < m + 1; ++i) { + dist[i * (n + 1)] = i; + } + for (int64_t j = 0; j < n + 1; ++j) { + dist[j] = j; + } + for (int64_t i = 1; i < m + 1; ++i) { + for (int64_t j = 1; j < n + 1; ++j) { + int cost = x1[i - 1] == x2[j - 1] ? 0 : 1; + int dels = dist[(i - 1) * (n + 1) + j] + 1; + int ins = dist[i * (n + 1) + (j - 1)] + 1; + int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost; + dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs)); + } + } + distance = dist[m * (n + 1) + n]; + } + + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + out[num] = distance; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index 432b9ba6f72f8dd11c666d5473c570bde60de995..37951fa7587c8a200f4733e5a46575461f5026cd 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
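// A quick check of the sketch above (and of the DOC example): transforming
// "kitten" into "sitting" takes two substitutions and one insertion, so the
// distance is 3 and the normalized distance is 3 / 7. This assumes the
// hypothetical LevenshteinWavefront helper defined earlier.

#include <cassert>
#include <vector>

int main() {
  const std::vector<int> hyp = {'k', 'i', 't', 't', 'e', 'n'};
  const std::vector<int> ref = {'s', 'i', 't', 't', 'i', 'n', 'g'};
  const int d = LevenshteinWavefront(hyp, ref);
  assert(d == 3);
  // With Attr(normalized) set, the kernels divide by the reference length.
  const float normalized = static_cast<float>(d) / ref.size();
  assert(normalized > 0.42f && normalized < 0.43f);
  return 0;
}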
+See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/elementwise_add_op.h" #include "paddle/operators/elementwise_op.h" @@ -19,10 +19,9 @@ namespace paddle { namespace operators { class ElementwiseAddOpMaker : public ElementwiseOpMaker { public: - ElementwiseAddOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Add", "$Out = X + Y$"); + SetComment("Add", "Out = X + Y"); AddComment(comment_); } }; @@ -34,13 +33,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, elementwise_add_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu index 7591428ac7c2f74f25f0f7d818eafcf59c8e4a4f..641cea323acee549898cb6f0245ccac4c069ce32 100644 --- a/paddle/operators/elementwise_add_op.cu +++ b/paddle/operators/elementwise_add_op.cu @@ -1,31 +1,32 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/elementwise_add_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index f04fe3ec6069ab1bf227be6a3a5c10ee908e4824..6478e1e0c2e1cfc8a1be5e8842113ec8ca33d762 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -19,33 +19,53 @@ namespace paddle { namespace operators { -template +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); - } -}; + using Tensor = framework::Tensor; -template -struct ElementwiseAddGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + TransformFunctor, T, DeviceContext> functor( + x, y, z, ctx.template device_context(), AddFunctor()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor.Run(); + return; } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = dz_e; + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; } } }; template -struct ElementwiseAddOneGradFunctor { +struct ElementwiseAddGradFunctor { template void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { @@ -56,7 +76,7 @@ struct ElementwiseAddOneGradFunctor { } if (dy) { auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = dz_e.sum(); + dy_e.device(d) = dz_e; } } }; @@ -100,12 +120,11 @@ struct ElementwiseAddBroadCast2GradFunctor { } }; -template +template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, - ElementwiseAddOneGradFunctor, + ElementwiseGradCompute, ElementwiseAddBroadCastGradFunctor, ElementwiseAddBroadCast2GradFunctor>(ctx); } diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 7a325199bd07e44042a4e8b3aae0ab93fae1c351..6ebd58b1b3dd79a465e70a24f7aab56261290bf6 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
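// How the add kernel above picks between Run, RunRowWise and RunMidWise is
// easiest to see through the pre/n/post decomposition. The sketch below is a
// standalone illustration (assumption: it mirrors get_mid_dims from
// elementwise_op_function.h, whose body is not shown in full here): X is
// viewed as a [pre, n, post] volume in which Y, flattened to n elements, is
// broadcast along the middle dimension; post == 1 selects the row-wise path.

#include <cassert>
#include <vector>

void GetMidDimsSketch(const std::vector<int>& x_dims,
                      const std::vector<int>& y_dims, int axis, int* pre,
                      int* n, int* post) {
  *pre = 1;
  *n = 1;
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) {
    assert(x_dims[axis + i] == y_dims[i]);  // dims covered by Y must match
    *n *= y_dims[i];
  }
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    *post *= x_dims[i];
}

// shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1:
//   pre = 2, n = 12, post = 5  -> RunMidWise.
// shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), axis = 2:
//   pre = 6, n = 20, post = 1  -> RunRowWise.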
*/ #include "paddle/operators/elementwise_div_op.h" #include "paddle/operators/elementwise_op.h" @@ -19,10 +19,9 @@ namespace paddle { namespace operators { class ElementwiseDivOpMaker : public ElementwiseOpMaker { public: - ElementwiseDivOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Div", "$Out = X / Y$"); + SetComment("Div", "Out = X / Y"); AddComment(comment_); } }; @@ -35,13 +34,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu index de4d0c33442a1fcfe0dd4c16df7ceeec737fbc6d..a0372123d6ffe7e3a90727ddd37a787c1030e0bd 100644 --- a/paddle/operators/elementwise_div_op.cu +++ b/paddle/operators/elementwise_div_op.cu @@ -1,31 +1,32 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/elementwise_div_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 8946ff3d25c2aff3dc3aa69368f0083371cd2fef..7783875e24ef0d94e50f49b3598c93ff8bd5d73f 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -19,11 +19,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -102,12 +102,11 @@ struct ElementwiseDivBroadCast2GradFunctor { } }; -template +template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, - ElementwiseDivGradFunctor, + ElementwiseGradCompute, ElementwiseDivBroadCastGradFunctor, ElementwiseDivBroadCast2GradFunctor>(ctx); } diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 8851267a524f51773a9f86ff83943cea4cb042aa..450dd05c796e22794315274b73398e85c8145940 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/elementwise_mul_op.h" #include "paddle/operators/elementwise_op.h" @@ -20,10 +20,9 @@ namespace operators { class ElementwiseMulOpMaker : public ElementwiseOpMaker { public: - ElementwiseMulOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Mul", "$Out = X \\odot\\ Y$"); + SetComment("Mul", "Out = X \\odot\\ Y"); AddComment(comment_); } }; @@ -36,13 +35,13 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu index b0dfdee1ccef56c6cda06ae6759017294fa5115c..f73e8afda960a89aff8568eab66b0f120db2e342 100644 --- a/paddle/operators/elementwise_mul_op.cu +++ b/paddle/operators/elementwise_mul_op.cu @@ -1,31 +1,32 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/elementwise_mul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4469b07eaa08a3b011a88e58f1d645dd30b10ced..0e6559eacc0f90fec78397ce66372071370291e5 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/operators/elementwise_op_function.h" @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,12 +101,11 @@ struct ElementwiseMulBroadCast2GradFunctor { } }; -template +template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, - ElementwiseMulGradFunctor, + ElementwiseGradCompute, ElementwiseMulBroadCastGradFunctor, ElementwiseMulBroadCast2GradFunctor>(ctx); } diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h index 56e5eb69bc382a2c15d88b759fa6987f02c6cabb..a342595b546bfca1a344cf8a549597df6a29adec 100644 --- a/paddle/operators/elementwise_op.h +++ b/paddle/operators/elementwise_op.h @@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -43,8 +43,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { public: - ElementwiseOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The first input tensor of elementwise op"); AddInput("Y", "(Tensor) The second input tensor of elementwise op"); @@ -59,7 +58,8 @@ Limited Elementwise {name} Operator. The equation is: -{equation} +.. math:: + {equation} X is a tensor of any dimension and the dimensions of tensor Y must be smaller than or equal to the dimensions of X. @@ -72,15 +72,16 @@ For case 2: Y will be broadcasted to match the shape of X and axis should be the starting dimension index for broadcasting Y onto X. -example: - shape(X) = (2, 3, 4, 5), shape(Y) = (,) - shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) - shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 +For example + .. code-block:: python -Both the input X and Y can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input X. + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Both of the inputs X and Y can carry the LoD (Level of Details) information, or neither of them. However, the output only shares the LoD information with input X.
)DOC"; AddComment(comment_); @@ -120,7 +121,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 488a35aafc8600bb8bb252fc3a5161c72a2f6df1..0c75276b03140473cee4b57a4022ff3c6989ab4c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -1,21 +1,26 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +#include "paddle/platform/transform.h" + +#ifdef __NVCC__ +#include +#endif #include "paddle/operators/math/math_function.h" @@ -54,18 +59,175 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } +template +class RowwiseTransformIterator; +template +class MidWiseTransformIterator; + +template +class RowwiseTransformIterator { + public: + RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator& operator++() { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + bool operator==(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int i_; + int64_t n_; +}; + +template +class MidWiseTransformIterator { + public: + MidWiseTransformIterator(const T* ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator& operator++() { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + return *this; + } + + bool operator==(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int64_t i_; + int64_t j_; + int64_t n_; + int64_t post_; +}; + +#ifdef __NVCC__ +template +class RowwiseTransformIterator + : public thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> + super_t; + HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + : super_t(x), begin_(x), n_(n){}; + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +class MidWiseTransformIterator + : public thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> + super_t; + HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post){}; + friend class thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif + +template +class TransformFunctor { + public: + TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z, const DeviceContext& ctx, Functor func) + : x_(x->data()), + y_(y->data()), + z_(z->mutable_data(ctx.GetPlace())), + nx_(x->numel()), + ctx_(ctx), + func_(func) {} + + inline void Run() const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); + } + + inline void RunMidWise(int n, int pre, int post) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } 
+ + private: + const T* x_; + const T* y_; + T* z_; + int64_t nx_; + const DeviceContext& ctx_; + Functor func_; +}; + #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ - template \ + template \ inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ framework::Tensor* z, \ const framework::ExecutionContext& ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_e); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ } \ - template \ + template \ inline void RunBroadCast(const framework::Tensor* x, \ const framework::Tensor* y, framework::Tensor* z, \ const framework::ExecutionContext& ctx, int pre, \ @@ -76,9 +238,11 @@ inline void get_mid_dims(const framework::DDim& x_dims, auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ .broadcast(Eigen::DSizes(pre, 1)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ - template \ + template \ inline void RunBroadCast2(const framework::Tensor* x, \ const framework::Tensor* y, \ framework::Tensor* z, \ @@ -90,11 +254,13 @@ inline void get_mid_dims(const framework::DDim& x_dims, auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ .broadcast(Eigen::DSizes(pre, 1, post)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ } -template +template void ElementwiseCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -106,11 +272,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { auto x_dims = x->dims(); auto y_dims = y->dims(); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); if (x_dims == y_dims) { functor f; - f.template Run(x, y, z, ctx); + f.template Run(x, y, z, ctx); return; } @@ -123,11 +289,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { get_mid_dims(x_dims, y_dims, axis, pre, n, post); if (post == 1) { functor f; - f.template RunBroadCast(x, y, z, ctx, pre, n); + f.template RunBroadCast(x, y, z, ctx, pre, n); return; } else { functor f; - f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); return; } } @@ -144,7 +310,7 @@ EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) EIGEN_FUNCTOR(Div, EIGEN_DIV); -template void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -154,7 +320,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 95d7979e39bfe7b484acb7771d1bd078014293a2..d3c51f0a697b7cb07a46871cdba2e84e902fd0f2 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 
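// What RowwiseTransformIterator buys TransformFunctor::RunRowWise above can
// be written out directly. The function below is a standalone reference
// (illustration only; the name is hypothetical) for Out = X + Y with
// shape(X) = [pre, n] and shape(Y) = [n]: the iterator's wrap-around at n is
// exactly a modular read of y for every row of x.

#include <vector>

void AddRowWiseRef(const std::vector<float>& x, const std::vector<float>& y,
                   int pre, int n, std::vector<float>* z) {
  z->resize(x.size());
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      // RowwiseTransformIterator resets its index to 0 once it reaches n,
      // so element j of y is reused in every row i.
      (*z)[i * n + j] = x[i * n + j] + y[j];
    }
  }
}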
PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/elementwise_sub_op.h" #include "paddle/operators/elementwise_op.h" @@ -19,10 +19,9 @@ namespace paddle { namespace operators { class ElementwiseSubOpMaker : public ElementwiseOpMaker { public: - ElementwiseSubOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Sub", "$Out = X - Y$"); + SetComment("Sub", "Out = X - Y"); AddComment(comment_); } }; @@ -34,13 +33,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, elementwise_sub_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu index ec23bec35feae26f5463c575b1ab6f58d417e100..7a2516ef6a6d5739e9f122455d289cbfeaaf2549 100644 --- a/paddle/operators/elementwise_sub_op.cu +++ b/paddle/operators/elementwise_sub_op.cu @@ -1,31 +1,32 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/elementwise_sub_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index 3f40c1c5bcea5e8473765b039de4ee2a16054f0c..347e92f87c7a46d3935f69864718d63661bef05f 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
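For context on the ElementwiseCompute changes above: get_mid_dims collapses X's shape into a (pre, n, post) triple around the slice matched by Y, and the kernel then picks RunBroadCast when post == 1 (a 2-D tile) or RunBroadCast2 otherwise (a 3-D tile over (pre, n, post)). The following is an illustrative sketch of that decomposition with hypothetical names, not the helper itself:

#include <cstdint>

// Sketch: split x_dims into [pre | n | post], where the middle block is
// the contiguous run of dimensions that y_dims matches starting at `axis`.
inline void GetMidDimsSketch(const int64_t* x_dims, int x_rank,
                             const int64_t* y_dims, int y_rank, int axis,
                             int* pre, int* n, int* post) {
  *pre = 1;
  *n = 1;
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (int i = 0; i < y_rank; ++i) *n *= y_dims[i];
  for (int i = axis + y_rank; i < x_rank; ++i) *post *= x_dims[i];
}

For example, x_dims = [2, 3, 4, 5] with y_dims = [3, 4] and axis = 1 yields pre = 2, n = 12, post = 5: Y is tiled twice and each of its elements repeats five times along the trailing dimensions.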
*/ #pragma once #include "paddle/operators/elementwise_op_function.h" @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -43,23 +43,6 @@ struct ElementwiseSubGradFunctor { } }; -template -struct ElementwiseSubOneGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * dz_e.sum(); - } - } -}; - template struct ElementwiseSubBroadCastGradFunctor { template +template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, - ElementwiseSubOneGradFunctor, + ElementwiseGradCompute, ElementwiseSubBroadCastGradFunctor, ElementwiseSubBroadCast2GradFunctor>(ctx); } diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc index 282775fcda45fe3bbd72bf04a7ae828f2c840ab7..08fa91ed72aa41ed2f513c090b9085410bb5cc47 100644 --- a/paddle/operators/expand_op.cc +++ b/paddle/operators/expand_op.cc @@ -55,7 +55,7 @@ class ExpandOp : public framework::OperatorWithKernel { class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { public: - ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor) A tensor with rank in [1, 6]." @@ -130,7 +130,8 @@ class ExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, ops::ExpandGradOp); -REGISTER_OP_CPU_KERNEL(expand, - ops::ExpandKernel); REGISTER_OP_CPU_KERNEL( - expand_grad, ops::ExpandGradKernel); + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu index 6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5..84e8fa567b80599d9687fed516eac6fbb308b24a 100644 --- a/paddle/operators/expand_op.cu +++ b/paddle/operators/expand_op.cu @@ -1,23 +1,24 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(expand, - ops::ExpandKernel); -REGISTER_OP_GPU_KERNEL( - expand_grad, ops::ExpandGradKernel); +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 8ae2c11a5d31dafc1b90d129054ebfabfb761bfe..a4994cf3a5becb8198f01f9252f3e96beba5a3c3 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -56,7 +56,7 @@ template using EigenTensor = framework::EigenTensor; -template +template class ExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -83,12 +83,13 @@ class ExpandKernel : public framework::OpKernel { auto x = EigenTensor::From(*in0); out0->mutable_data(context.GetPlace()); auto y = EigenTensor::From(*out0); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = x.broadcast(bcast_dims); } }; -template +template class ExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -125,7 +126,7 @@ class ExpandGradKernel : public framework::OpKernel { auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); - out0->CopyFrom(*in0, context.GetPlace(), context.device_context()); + framework::Copy(*in0, context.GetPlace(), context.device_context(), out0); } else { switch (dims) { REP_EXPAND_GRAD_TEMPLATE(72) @@ -163,7 +164,8 @@ class ExpandGradKernel : public framework::OpKernel { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device(context.GetEigenDevice()) = + x_grad.device( + *context.template device_context().eigen_device()) = out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); } }; diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 0dd84cbeaafbafd45132b0a0b744554ce7475411..d738e1850ca4f658f4fca5c9bf643c44f676cce9 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
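A quick way to see why the ExpandGradKernel above reduces with reshape followed by sum: each element of X fans out to several elements of Out, so its gradient must accumulate over all of those copies. A scalar sketch for a rank-1 X of size 2 expanded by a factor of 3 (values hypothetical):

// Equivalent of dOut.reshape({3, 2}).sum(axis 0) for this tiny case:
// Eigen's broadcast tiles X, so out[r * 2 + j] came from x[j].
float dout[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // gradient of Out
float dx[2] = {0.f, 0.f};
for (int r = 0; r < 3; ++r) {
  for (int j = 0; j < 2; ++j) {
    dx[j] += dout[r * 2 + j];
  }
}
// dx == {9, 12}; the kernel generalizes this to rank <= 6 via
// reshape_dims/reduce_dims.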
*/ #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/op_registry.h" @@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); @@ -47,15 +47,19 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); } }; class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - FeedOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of feed op"); AddOutput("Out", "The output of feed op"); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 8108ae69dec4bafd1c04d5ab05eef6f467d4c6e8..7205ee2a879dfff711ad1cabebe197ef53377a1c 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -1,19 +1,20 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, @@ -51,7 +52,10 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? 
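The FeedOp hunk above shows the pattern this patch applies across OperatorBase subclasses: Run now receives a bare platform::Place, and the operator resolves the matching DeviceContext on demand. Condensed into a sketch, using only the calls visible in this diff:

// Resolve-then-copy idiom used by FeedOp (and FetchOp below).
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);  // the context created for `place`
framework::Copy(feed_item, place, dev_ctx, out_item);
out_item->set_lod(feed_item.lod());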
- dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(src_item.place()); + + Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); dst_item.set_lod(src_item.lod()); @@ -61,8 +65,7 @@ class FetchOp : public framework::OperatorBase { class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - FetchOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of fetch op"); AddOutput("Out", "The output of fetch op"); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 985b5d1e865e513d833bff72dcd20a8f20851d8c..c74a5b6ced3af27847dd36511aeab0ee9614415a 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -49,10 +49,10 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - static_cast(ctx.Attr("data_type")), + static_cast(ctx.Attr("dtype")), ctx.device_context()); } }; @@ -60,13 +60,12 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { class FillConstantBatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr("data_type", + AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddInput("Input", "(Tensor) Tensor " "whose dim_idx th dimension is used to specify the batch_size"); @@ -100,8 +99,11 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpMaker); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc index 9e7a1eeab863c962ca72908e561e12a04d5021c5..608f4b91623e4ddf0240c37be7a8e56117dd40f2 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -1,25 +1,28 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/fill_constant_batch_size_like_op.h" #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h index 339d97a30a5819ab488e83990651ba99212239ec..66da9d0307e36db3726f30518c8c57a923e54388 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.h +++ b/paddle/operators/fill_constant_batch_size_like_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -27,8 +27,9 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto value = ctx.Attr("value"); - math::SetConstant setter; - setter(ctx.device_context(), out, static_cast(value)); + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); } }; diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index 818f113b90a4c239a857791fb9957e51d3287b97..dcd43a30c86b62d79f52ac640f14b295a062146c 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -33,8 +34,9 @@ class FillConstantOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { - auto data_type = static_cast(Attr("data_type")); + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); auto value = Attr("value"); auto force_cpu = Attr("force_cpu"); auto &out = @@ -44,21 +46,23 @@ class FillConstantOp : public framework::OperatorBase { auto cpu = platform::CPUPlace(); out.mutable_data(cpu, framework::ToTypeIndex(data_type)); } else { - out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type)); + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); math::set_constant(dev_ctx, &out, value); } }; class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillConstantOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr("data_type", + AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddAttr>("shape", "(vector) The shape of the output"); AddAttr("value", "(float, default 0) The value to be filled") .SetDefault(0.0f); diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f5a2ed169565771629fe8df7c25cf23bc94e339 --- /dev/null +++ b/paddle/operators/fill_op.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
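Another rename threaded through these hunks: the attribute formerly called `data_type` (typed against framework::DataType) is now `dtype`, stored as a plain int and cast back to the protobuf enum at its use sites. The recurring two-line idiom, isolated here for clarity:

// Attributes carry the enum as int; kernels cast it back before
// allocating, as in FillConstantOp::Run above.
auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
out.mutable_data(place, framework::ToTypeIndex(dtype));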
*/ + +#include "paddle/framework/data_type.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector<float> &value) + : tensor_(tensor), value_(value) {} + + template <typename T> + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data<T>(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast<T>(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector<float> &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable<framework::LoDTensor>()); + out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape"))); + auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr<bool>("force_cpu"); + out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(place)) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. + tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value"))); + + if (!force_cpu && platform::is_gpu_place(place)) { + // Copy tensor to out + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(tensor, place, dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill a tensor with `value` and `shape`. The type of the tensor is specified by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr<std::vector<float>>( + "value", "The float values of tensor, which are flattened in row major"); + AddAttr<std::vector<int>>("shape", "The shape of output tensor"); + AddAttr<int>("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::proto::DataType::FP32); + AddAttr<bool>("force_cpu", + "Whether the output tensor must be at CPU memory or not.
" + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 95fb5932b8b555e1357adc9fdfb7b6e6db7da71d..b4ae1de876010effff6bf577a4e33043f6760a4f 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -24,20 +24,19 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FillZerosLikeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Y"), - "Output(Y) of FillZerosLikeOp should not be null."); - ctx->SetOutputDim("Y", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Y"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillZerosLikeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); } }; class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillZerosLikeOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of fill-zeros-like op."); - AddOutput("Y", "The variable will be filled up with zeros."); + AddOutput("Out", "The variable will be filled up with zeros."); AddComment(R"DOC( FillZerosLike Operator. @@ -54,8 +53,9 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc index 1501a17441072223ba0e8cf5b6c8cdd5e903a467..b7048e8f5857e646e16d5017593f5d3c6e79ea7e 100644 --- a/paddle/operators/fill_zeros_like_op.cu.cc +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -1,24 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/fill_zeros_like_op.h" #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 7e7d78eea2bce427d6ad4dfb77bcb4ace35cd287..351ecf8b2f1d945fabdd1d6c5ed56f76f3caae61 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -19,15 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Y"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.device_context(), out, static_cast(0)); + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); } }; diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d00700823d48eb2ea4fc64d1fa2989f18c7c5f18 --- /dev/null +++ b/paddle/operators/ftrl_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
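Stepping back to FillOpVisitor a few hunks up: it obtains its compile-time type from framework::VisitDataType, which bridges the runtime dtype to a templated call. Roughly, as a sketch of the dispatch (the exact case list is an assumption):

// What framework::VisitDataType(dtype, visitor) amounts to:
switch (dtype) {
  case framework::proto::DataType::FP32:
    visitor.operator()<float>();  // instantiates the templated operator()
    break;
  // ... one case per supported type (FP64, INT32, INT64, BOOL, ...)
  default:
    PADDLE_THROW("Not supported");
}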
*/ + +#include "paddle/operators/ftrl_op.h" + +namespace paddle { +namespace operators { + +class FTRLOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"), + "Input(SquaredAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"), + "Input(LinearAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of FTRL should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"), + "Output(SquaredAccumOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"), + "Output(LinearAccumOut) of FTRL should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of FTRL Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("SquaredAccumOut", param_dim); + ctx->SetOutputDim("LinearAccumOut", param_dim); + } +}; + +class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("SquaredAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates squared gradients."); + AddInput("LinearAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates linear gradients."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("SquaredAccumOut", + "(Tensor) Output accumulated squared" + " gradients."); + AddOutput("LinearAccumOut", + "(Tensor) Output accumulated linear" + " gradients."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddAttr("lr_power", + "(float, default -0.5f) " + "Learning Rate Power.") + .SetDefault(-0.5f); + AddComment(R"DOC( +FTRL (Follow The Regularized Leader) Operator. 
+ +Optimizer that implements the FTRL algorithm: + +$$ +new\_accum = squared\_accum + grad^2 \\ +if (lr\_power == -0.5) { + linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) / + (learning\_rate * param) \\ +} else { + linear\_accum += grad - + (new\_accum^{-lr\_power} - accum^{-lr\_power}) / + (learning\_rate * param) \\ +} + +x = (l1 * sign(linear\_accum) - linear\_accum) +if (lr\_power == -0.5) { + y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} else { + y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} +squared\_accum += grad^2; +$$ + +The paper that proposed Follow The Regularized Leader (FTRL): +(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..abbbe7adbe6bd14f55f7f941c5e6740fada24910 --- /dev/null +++ b/paddle/operators/ftrl_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/ftrl_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4eea04cd8d61bb34fc612e0ca1765a664e329ca9 --- /dev/null +++ b/paddle/operators/ftrl_op.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
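Because the FTRL update is purely element-wise, a scalar walk-through reproduces it exactly. Below is a self-contained sketch of one step in the lr_power == -0.5 branch, with arbitrary initial values; it follows the Eigen kernel in ftrl_op.h, which computes the documented (learning_rate * param) factor as a division by the learning rate followed by a multiplication with param:

#include <cmath>
#include <cstdio>

int main() {
  float param = 1.0f, sq_accum = 0.1f, lin_accum = 0.0f;
  const float grad = 0.5f, lr = 0.1f, l1 = 0.01f, l2 = 0.1f;

  const float new_accum = sq_accum + grad * grad;
  lin_accum += grad - (std::sqrt(new_accum) - std::sqrt(sq_accum)) / lr * param;

  const float sign = (lin_accum > 0.f) - (lin_accum < 0.f);
  const float x = l1 * sign - lin_accum;
  const float y = std::sqrt(new_accum) / lr + 2.f * l2;
  // Parameters whose linear accumulator stays inside the l1 ball are
  // clipped to zero; the rest shrink towards it.
  param = std::fabs(lin_accum) > l1 ? x / y : 0.f;
  sq_accum = new_accum;  // squared_accum += grad^2

  std::printf("param=%g squared_accum=%g linear_accum=%g\n", param, sq_accum,
              lin_accum);
  return 0;
}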
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class FTRLOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* sq_accum_out = ctx.Output("SquaredAccumOut"); + auto* lin_accum_out = ctx.Output("LinearAccumOut"); + + param_out->mutable_data(ctx.GetPlace()); + sq_accum_out->mutable_data(ctx.GetPlace()); + lin_accum_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + auto lr_power = static_cast(ctx.Attr("lr_power")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto sq_accum = + EigenVector::Flatten(*ctx.Input("SquaredAccumulator")); + auto lin_accum = + EigenVector::Flatten(*ctx.Input("LinearAccumulator")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto s_acc_out = EigenVector::Flatten(*sq_accum_out); + auto l_acc_out = EigenVector::Flatten(*lin_accum_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto new_accum = sq_accum + g * g; + // Special case for lr_power = -0.5 + if (lr_power == static_cast(-0.5)) { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p; + } else { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) / + lr.broadcast(grad_dsize)) * + p; + } + + auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out); + if (lr_power == static_cast(-0.5)) { + auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } else { + auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } + + s_acc_out.device(place) = sq_accum + g * g; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h index 8d04ecd284226c7b4c6cdd5531915fee2d94ce61..9840c066f053e5e1cd1c756d4dd938eace1a5eb4 100644 --- a/paddle/operators/gather.cu.h +++ b/paddle/operators/gather.cu.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include "paddle/framework/tensor.h" @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using platform::Place; +using platform::DeviceContext; #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index 8f80fb162519f60fcce897b3c31a3507bbf6ba6d..597fdad0794ec076669a0b2e6157e5c74c80d735 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -67,7 +67,7 @@ class GatherGradOp : public framework::OperatorWithKernel { class GatherOpMaker : public framework::OpProtoAndCheckerMaker { public: - GatherOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The source input of gather op"); AddInput("Index", "The index input of gather op"); diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu index 92219d6a433e6db0bb9886ed8670cbafaa843ff8..eec2415e1de2434de0a920567863d421d2d3032d 100644 --- a/paddle/operators/gather_op.cu +++ b/paddle/operators/gather_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "gather.cu.h" #include "paddle/framework/eigen.h" @@ -49,7 +49,8 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); @@ -60,5 +61,5 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h index 8276ed0d3d8b676aafab45fae70942e78b72b8e6..1a1ba0c41aef95d3dc8cc929db72770a7bd08b18 100644 --- a/paddle/operators/gather_op.h +++ b/paddle/operators/gather_op.h @@ -53,7 +53,8 @@ class GatherGradientOpKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); ScatterAssign(ctx.device_context(), *dO, *Index, dX); diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 53ad86c6c48d1868f4495af51661d91b39a84f0b..2dca05760ecc98f822b8c426af3152d365526d2b 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
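For orientation on the gather hunks above: the forward kernel computes Out[i] = X[Index[i]] row-wise, and the backward pass scatter-adds rows of dOut into dX, which is why both gradient kernels zero-fill dX before calling ScatterAssign/GPUScatterAssign. A rank-1 sketch:

#include <vector>

// Schematic gather forward/backward; duplicate indices are why the
// backward pass must accumulate rather than assign.
void GatherSketch(const std::vector<float>& x, const std::vector<int>& index,
                  std::vector<float>* out, const std::vector<float>& dout,
                  std::vector<float>* dx) {
  out->resize(index.size());
  dx->assign(x.size(), 0.f);  // mirrors the dxt.constant(0) zero-fill above
  for (size_t i = 0; i < index.size(); ++i) {
    (*out)[i] = x[index[i]];     // forward: Out[i] = X[Index[i]]
    (*dx)[index[i]] += dout[i];  // backward: scatter-add of dOut
  }
}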
*/ #include #include "paddle/framework/op_registry.h" @@ -57,18 +60,17 @@ class GaussianRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - static_cast(ctx.Attr("data_type")), + static_cast(ctx.Attr("dtype")), ctx.device_context()); } }; class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: - GaussianRandomOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "Output matrix of gaussian random op"); @@ -88,10 +90,10 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { "Random seed of generator." "0 means use system wide seed.") .SetDefault(0); - AddAttr("data_type", + AddAttr("dtype", "(int, default 5(FP32)) " "Output data type.") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(R"DOC( GaussianRandom Operator. diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 315560bf1ba8a66b9a3b7d79510d202885e845d6..8a70db17e17ebf7d5bad1e1ee6a2acdff1b85a09 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -#include -#include +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include #include "paddle/framework/op_registry.h" @@ -60,5 +60,5 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/operators/get_places_op.cc b/paddle/operators/get_places_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..24fafb23074c358669715f1840246c3520f948c7 --- /dev/null +++ b/paddle/operators/get_places_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/platform/gpu_info.h" +#endif + +namespace paddle { +namespace operators { + +static size_t CUDADevCount() { +#ifdef PADDLE_WITH_CUDA + return platform::GetCUDADeviceCount(); +#else + return 0UL; +#endif +} + +class GetPlacesOp : public framework::OperatorBase { + public: + GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + bool is_gpu; + if (Attr("device_type") == "AUTO") { + is_gpu = platform::is_gpu_place(place); + } else { + is_gpu = Attr("device_type") == "CUDA"; + } + auto device_count = static_cast(Attr("device_count")); + if (device_count == 0) { + device_count = + is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); + } + PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", + is_gpu ? "GPU" : "CPU"); + + auto out_var_name = Output("Out"); + auto &places = + *(detail::Ref(scope.FindVar(out_var_name), + "Output variable %s cannot be found", out_var_name) + .GetMutable()); + places.reserve(device_count); + if (is_gpu) { + PADDLE_ENFORCE_LE(device_count, CUDADevCount(), + "Only %d CUDA devices found, cannot set to %d", + CUDADevCount(), device_count); + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CUDAPlace(static_cast(i))); + } + } else { + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CPUPlace()); + } + } + } +}; + +class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "vector of Place"); + AddAttr("device_count", "device count").SetDefault(0); + AddAttr("device_type", "device type") + .InEnum({"CUDA", "CPU", "AUTO"}) + .SetDefault("AUTO"); + AddComment(R"DOC( +Returns a list of places based on flags. The list will be used for parallel +execution. 
+)DOC"); + } +}; + +class GetPlacesInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o_name : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o_name).SetType( + framework::proto::VarDesc::PLACE_LIST); + } + } +}; + +class GetPlacesInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + // Do nothing + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker, + ops::GetPlacesInferVarType, ops::GetPlacesInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 5aa03f8916a67222fb0ca5781533766063e52683..76f2adefede3b4bc4035f86f8f8663eed29343ae 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/gru_op.h" @@ -67,7 +70,7 @@ class GRUOp : public framework::OperatorWithKernel { class GRUOpMaker : public framework::OpProtoAndCheckerMaker { public: - GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(LoDTensor) The first input is a LodTensor, which supports " @@ -213,8 +216,9 @@ class GRUGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); -REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc index 0ceff94ec3ddaadbd5f0ca4f5a4eebe6cb8ee3a9..9cb0cc42d5589792aae6d99cec807aac6e4991b6 100644 --- a/paddle/operators/gru_op.cu.cc +++ b/paddle/operators/gru_op.cu.cc @@ -1,22 +1,23 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/gru_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>, - ops::GRUKernel<paddle::platform::GPUPlace, double>); -REGISTER_OP_GPU_KERNEL(gru_grad, - ops::GRUGradKernel<paddle::platform::GPUPlace, float>, - ops::GRUGradKernel<paddle::platform::GPUPlace, double>); +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel<paddle::platform::CUDADeviceContext, float>, + ops::GRUKernel<paddle::platform::CUDADeviceContext, double>); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel<paddle::platform::CUDADeviceContext, float>, + ops::GRUGradKernel<paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 1b18368e0e16365682520b62a7f6adab0cbb527f..b1957fb9ce6add8628cb206abf2c569d3f615c85 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -1,19 +1,20 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.
*/ #pragma once +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/operators/math/gru_compute.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/sequence2batch.h" @@ -27,16 +28,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class GRUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -60,19 +61,19 @@ class GRUKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = context.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } int frame_size = hidden_dims[1]; - math::hl_gru_value gru_value; - gru_value.gateWeight = const_cast(weight_data); - gru_value.stateWeight = + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); Tensor ordered_h0; const size_t* order = batch_gate->lod()[2].data(); @@ -80,14 +81,19 @@ class GRUKernel : public framework::OpKernel { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. 
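The comment above is getting at this: LoDTensor2BatchFunctor sorts sequences by length before the recurrence, so the initial state h0 must first be gathered into that order, and in the backward pass the gradients must be scattered back. A minimal standalone sketch of the row shuffle that ReorderInitState delegates to math::CopyMatrixRowsFunctor; the helper name and the vector-of-rows representation are illustrative:

#include <cstddef>
#include <iostream>
#include <vector>

// Row gather/scatter mirroring what ReorderInitState asks of
// math::CopyMatrixRowsFunctor: dst[i] = src[index[i]] when indexed_src is
// true (forward), dst[index[i]] = src[i] when false (backward scatter).
void ReorderRows(const std::vector<std::vector<float>>& src,
                 const std::vector<std::size_t>& index,
                 std::vector<std::vector<float>>* dst, bool indexed_src) {
  dst->resize(src.size());
  for (std::size_t i = 0; i < index.size(); ++i) {
    if (indexed_src) {
      (*dst)[i] = src[index[i]];  // gather h0 into batch order
    } else {
      (*dst)[index[i]] = src[i];  // scatter gradients back to input order
    }
  }
}

int main() {
  std::vector<std::vector<float>> h0 = {{0.f}, {1.f}, {2.f}};
  std::vector<std::size_t> order = {2, 0, 1};  // sequences sorted by length
  std::vector<std::vector<float>> ordered;
  ReorderRows(h0, order, &ordered, true);
  std::cout << ordered[0][0] << ordered[1][0] << ordered[2][0] << "\n";  // 201
}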
- ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); - gru_value.prevOutValue = ordered_h0.data(); + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); } else { - gru_value.prevOutValue = nullptr; + gru_value.prev_out_value = nullptr; } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); @@ -96,17 +102,16 @@ class GRUKernel : public framework::OpKernel { Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.outputValue = hidden_t.data(); - gru_value.gateValue = gate_t.data(); - gru_value.resetOutputValue = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( - dev_ctx, gru_value, frame_size, cur_batch_size, - math::ActiveType(context.Attr("activation")), - math::ActiveType(context.Attr("gate_activation"))); - gru_value.prevOutValue = gru_value.outputValue; + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, *hidden); } @@ -116,7 +121,7 @@ class GRUKernel : public framework::OpKernel { } }; -template +template class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -141,14 +146,14 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; - auto& dev_ctx = context.device_context(); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); @@ -156,85 +161,89 @@ class GRUGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_h0_grad; const size_t* order = batch_gate->lod()[2].data(); if (h0) { - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); } if (h0_grad) { ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); - zero(context.device_context(), &ordered_h0_grad, static_cast(0.0)); + zero(context.template device_context(), &ordered_h0_grad, + static_cast(0.0)); } bool is_reverse = context.Attr("is_reverse"); batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, 
batch_hidden_grad, false, is_reverse); - math::hl_gru_value gru_value; - gru_value.gateWeight = const_cast(weight_data); - gru_value.stateWeight = + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - math::hl_gru_grad gru_grad; + math::GRUMetaGrad gru_grad; if (weight_grad) { - gru_grad.gateWeightGrad = + gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); zero(dev_ctx, weight_grad, static_cast(0.0)); - gru_grad.stateWeightGrad = + gru_grad.state_weight_grad = weight_grad->data() + 2 * frame_size * frame_size; } else { - gru_grad.gateWeightGrad = nullptr; - gru_grad.stateWeightGrad = nullptr; + gru_grad.gate_weight_grad = nullptr; + gru_grad.state_weight_grad = nullptr; } auto batch_starts = batch_hidden_grad.lod()[0]; size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); for (int n = static_cast(num_batch) - 1; n >= 0; n--) { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; Tensor gate_t = batch_gate->Slice(bstart, bend); - gru_value.gateValue = gate_t.data(); + gru_value.gate_value = gate_t.data(); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - gru_value.resetOutputValue = reset_hidden_prev_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); - gru_grad.outputGrad = hidden_grad_t.data(); + gru_grad.output_grad = hidden_grad_t.data(); Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); - gru_grad.gateGrad = gate_grad_t.data(); + gru_grad.gate_grad = gate_grad_t.data(); Tensor reset_hidden_prev_grad_t = batch_reset_hidden_prev_grad.Slice(bstart, bend); - gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data(); + gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); if (n == 0) { - gru_value.prevOutValue = h0 ? ordered_h0.data() : nullptr; - gru_grad.prevOutGrad = + gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr; + gru_grad.prev_out_grad = h0 && h0_grad ? 
ordered_h0_grad.data() : nullptr; } else { int bstart_pre = static_cast(batch_starts[n - 1]); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); - gru_value.prevOutValue = hidden_prev_t.data(); + gru_value.prev_out_value = hidden_prev_t.data(); Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); - gru_grad.prevOutGrad = hidden_prev_grad_t.data(); + gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } - math::GRUUnitGradFunctor::compute( - dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, - math::ActiveType(context.Attr("activation")), - math::ActiveType(context.Attr("gate_activation"))); + math::GRUUnitGradFunctor::compute( + dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, + active_gate); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, *input_grad); } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { - ReorderInitState(context.device_context(), ordered_h0_grad, - order, h0_grad, false); + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); } } diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 89c027ff1eea93012dc5ab22b081786efc328e96..c354293be7720abd7d96b1c4311b32049a16730c 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/gru_unit_op.h" @@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { public: - GRUUnitOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " @@ -114,18 +113,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(sigmoid) .InEnum({identity, sigmoid, tanh, relu}); AddComment(R"DOC( -GRUUnit Operator. 
- -This operator implements partial calculations of the GRU unit as follows: +GRUUnit Operator implements partial calculations of the GRU unit as follows: $$ -update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\ -reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r) \\ -output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\ -output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev}) +update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) $$ -The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp. +which is the same as one time step of the GRU Operator. + +@note To implement the complete GRU unit, a fully-connected operator must be +applied beforehand to produce xu, xr and xc as the Input of the GRUUnit operator. )DOC"); } @@ -150,12 +150,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { "ResetHiddenPrev"); PADDLE_ENFORCE(ctx->HasInput("Hidden"), "Input(%s) of GRUUnitGradOp should not be null.", "Hidden"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")), - "Input(%s@GRAD) of GRUUnitGradOp should not be null.", - "Gate"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")), - "Input(%s@GRAD) of GRUUnitGradOp should not be null.", - "ResetHiddenPrev"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), "Input(%s@GRAD) of GRUUnitGradOp should not be null.", "Hidden"); @@ -206,9 +200,10 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, ops::GRUUnitGradOp); -REGISTER_OP_CPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); REGISTER_OP_CPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu index 821c8c6421771bd99474b0b2f8aa2acf04697779..95c8c23dadadf0e053012c86d44346ee31565cfc 100644 --- a/paddle/operators/gru_unit_op.cu +++ b/paddle/operators/gru_unit_op.cu @@ -1,24 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
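A scalar walk-through of the GRUUnit equations quoted above, assuming sigmoid for actGate and tanh for actNode (both are configurable attributes of the operator) and folding the bias terms into the projected inputs xu, xr and xc. This is an illustrative sketch, not the operator's kernel:

#include <cmath>
#include <iostream>

// One step of the GRUUnit equations for frame_size == 1. xu, xr, xc are the
// pre-projected inputs (biases folded in) supplied by a preceding
// fully-connected layer; wu, wr, wc play the roles of W_u, W_r, W_c.
float GruUnitStep(float xu, float xr, float xc, float h_prev, float wu,
                  float wr, float wc) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  float u = sigmoid(xu + wu * h_prev);          // update gate u_t
  float r = sigmoid(xr + wr * h_prev);          // reset gate r_t
  float c = std::tanh(xc + wc * (r * h_prev));  // output candidate {h}_t
  return (1.f - u) * h_prev + u * c;            // h_t
}

int main() {
  // The output interpolates between h_prev and the candidate c via u_t.
  std::cout << GruUnitStep(0.1f, 0.2f, 0.3f, 0.5f, 0.4f, 0.4f, 0.4f) << "\n";
}

Note that (1 - u_t) * h_{t-1} + u_t * {h}_t rearranges to h_{t-1} + u_t * ({h}_t - h_{t-1}), which is exactly the corrected forward expression h.device(place) = u * (c - h_p) + h_p in gru_unit_op.h below.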
*/ #define EIGEN_USE_GPU #include "paddle/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_GPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index c53e7d9827e0395e6ce613302e732b2797f83cdd..a77be46718b766d9a0a8b8fb4cf2316b44687db8 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -28,9 +28,13 @@ template using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; + enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; -template +template class GRUUnitKernel : public framework::OpKernel { public: template @@ -67,7 +71,8 @@ class GRUUnitKernel : public framework::OpKernel { auto g = EigenMatrix::From(*gate); auto r_h_p = EigenMatrix::From(*reset_hidden_prev); auto h = EigenMatrix::From(*hidden); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); // calculate unactivated gate outputs if (bias) { @@ -82,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel { const T* weight_data = weight->data(); T* gate_data = gate->data(); T* reset_hidden_prev_data = reset_hidden_prev->data(); - math::gemm(context.device_context(), false, false, batch_size, - 2 * frame_size, frame_size, 1, hidden_prev_data, - frame_size, weight_data, frame_size * 2, 1, gate_data, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); // calculate activited gate Eigen::array extents({{batch_size, frame_size}}); @@ -98,11 +103,11 @@ class GRUUnitKernel : public framework::OpKernel { g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate r_h_p.device(place) = r * h_p; // reset previous hidden state - math::gemm(context.device_context(), false, false, batch_size, - frame_size, frame_size, 1, reset_hidden_prev_data, - frame_size, weight_data + frame_size * frame_size * 2, - frame_size, 1, gate_data + frame_size * 2, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); Eigen::array c_offsets({{0, frame_size * 2}}); ActCompute(context.Attr("activation"), place, @@ -110,11 +115,11 @@ class GRUUnitKernel : public framework::OpKernel { auto c = g.slice(c_offsets, extents); // output candidate // calculate final output - h.device(place) = u * (h_p - c) + c; + h.device(place) = u * (c - h_p) + h_p; } }; -template +template class GRUUnitGradKernel : public framework::OpKernel { public: template @@ -146,34 +151,27 @@ class GRUUnitGradKernel : public framework::OpKernel { auto* weight_grad = context.Output(framework::GradVarName("Weight")); auto* bias_grad = context.Output(framework::GradVarName("Bias")); - input_grad->mutable_data(context.GetPlace()); - hidden_prev_grad->mutable_data(context.GetPlace()); - weight_grad->mutable_data(context.GetPlace()); Tensor gate_grad; - gate_grad.mutable_data(input->dims(), context.GetPlace()); Tensor reset_hidden_prev_grad; - reset_hidden_prev_grad.mutable_data(reset_hidden_prev->dims(), - context.GetPlace()); - - int batch_size = input->dims()[0]; - int frame_size = hidden_prev->dims()[1]; const T* hidden_prev_data = hidden_prev->data(); - T* hidden_prev_grad_data = hidden_prev_grad->data(); const T* weight_data = weight->data(); - T* weight_grad_data = weight_grad->data(); - T* gate_grad_data = gate_grad.data(); + T* gate_grad_data = + gate_grad.mutable_data(input->dims(), context.GetPlace()); const T* reset_hidden_prev_data = reset_hidden_prev->data(); - T* reset_hidden_prev_grad_data = 
reset_hidden_prev_grad.data(); + T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data( + reset_hidden_prev->dims(), context.GetPlace()); auto h_p = EigenMatrix::From(*hidden_prev); auto g = EigenMatrix::From(*gate); auto d_h = EigenMatrix::From(*hidden_grad); - auto d_x = EigenMatrix::From(*input_grad); - auto d_h_p = EigenMatrix::From(*hidden_prev_grad); auto d_g = EigenMatrix::From(gate_grad); auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; Eigen::array extents({{batch_size, frame_size}}); Eigen::array u_offsets({{0, 0}}); @@ -185,42 +183,58 @@ class GRUUnitGradKernel : public framework::OpKernel { // backward for unactivated update gate ActGradCompute(context.Attr("gate_activation"), place, u, u, - d_g.slice(u_offsets, extents), d_h * (h_p - c)); + d_g.slice(u_offsets, extents), d_h * (c - h_p)); // backward for unactivated output candidate ActGradCompute(context.Attr("activation"), place, c, c, - d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u)); + d_g.slice(c_offsets, extents), d_h * u); // backward for reset_hidden_prev - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size, 1, - gate_grad_data + frame_size * 2, frame_size * 3, - weight_data + frame_size * frame_size * 2, frame_size, - 0, reset_hidden_prev_grad_data, frame_size); - // backward for state_weight - math::gemm( - context.device_context(), true, false, frame_size, frame_size, - batch_size, 1, reset_hidden_prev_data, frame_size, - gate_grad_data + frame_size * 2, frame_size * 3, 0, - weight_grad_data + frame_size * frame_size * 2, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); // backward for unactivated reset gate ActGradCompute(context.Attr("gate_activation"), place, r, r, d_g.slice(r_offsets, extents), d_r_h_p * h_p); - // backward for update_gate_weight and reset_gate_weight - math::gemm(context.device_context(), true, false, frame_size, - frame_size * 2, batch_size, 1, hidden_prev_data, - frame_size, gate_grad_data, frame_size * 3, 0, - weight_grad_data, frame_size * 2); + // backward for weight + if (weight_grad) { + T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); + // backward for state_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, + weight_grad_data + frame_size * frame_size * 2, frame_size); + + // backward for update_gate_weight and reset_gate_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); + } // backward for hidden_prev - d_h_p.device(place) = d_r_h_p * r + d_h * u; - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size * 2, 1, gate_grad_data, - frame_size * 3, weight_data, frame_size * 2, 1, - hidden_prev_grad_data, frame_size); + if (hidden_prev_grad) { + T* hidden_prev_grad_data = + hidden_prev_grad->mutable_data(context.GetPlace()); + auto d_h_p = 
EigenMatrix::From(*hidden_prev_grad); + d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); + } // backward for input - d_x.device(place) = d_g; + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + auto d_x = EigenMatrix::From(*input_grad); + d_x.device(place) = d_g; + } // backward for bias if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - auto d_b = EigenMatrix::From(*bias_grad); + auto d_b = EigenVector::Flatten(*bias_grad); d_b.device(place) = d_g.sum(Eigen::array({{0}})); } } diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..19d2e9dc56fe11f9dfb13e8cb271a23e128bf91b --- /dev/null +++ b/paddle/operators/hinge_loss_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/hinge_loss_op.h" + +namespace paddle { +namespace operators { + +class HingeLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Logits) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Logits) contains a real value, " + "so the 2nd dimension of Input(Logits) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Logits", "Loss"); + } +}; + +template +class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "The input value (Logits) of Hinge loss op." + "Logits is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Hinge loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the hinge loss."); + AddComment(R"DOC( +HingeLoss Operator. + +Let x be a logit (prediction) and y be the actual label. The logit can +take any values from (-inf, inf), but the labels should be either -1 or 1. +Then, the hinge loss is computed as follows: + +$$ +L(x, y) = \max(1 - y \cdot x, 0) +$$ + +Note that the labels passed as input take values 0 or 1 and are mapped to -1 +and 1 internally before the loss is computed.
+ +)DOC"); + } +}; + +class HingeLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Input(Logits@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto lab_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Logits"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, + hinge_loss_grad, ops::HingeLossGradOp); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b9cfbc50c49c6cc902cb3667200c12c74fb5d13d --- /dev/null +++ b/paddle/operators/hinge_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/hinge_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..91369cfb8a5d4f40be9e6249b50079ba2b550003 --- /dev/null +++ b/paddle/operators/hinge_loss_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class HingeLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* loss = context.Output("Loss"); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + loss->mutable_data(context.GetPlace()); + auto l = framework::EigenVector::Flatten(*loss); + l.device(place) = + (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +class HingeLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* dloss = + context.Input(framework::GradVarName("Loss")); + auto* dpred = + context.Output(framework::GradVarName("Logits")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + auto dl = framework::EigenVector::Flatten(*dloss); + + if (dpred) { + dpred->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + auto alt_labels = static_cast(2) * y - static_cast(1); + dx.device(place) = + dl * ((x * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 3435e74b0afb470fcbd1c0f4e06ad363352cac00..5c92f2c7b2d2f701bcc487716db41a0cce91002f 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel { template class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - HuberLossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input value of huber loss op." @@ -70,11 +69,18 @@ input value and Y as the target value. Huber loss can evaluate the fitness of X to Y. Different from MSE loss, Huber loss is more robust for outliers. The shape of X and Y are [batch_size, 1]. The equation is: -L_{\delta}(y, f(x)) = +$$ +Out_{\delta}(X, Y)_i = \begin{cases} -0.5 * (y - f(x))^2, \quad |y - f(x)| \leq \delta \\ -\delta * (|y - f(x)| - 0.5 * \delta), \quad otherwise +0.5 * (Y_i - X_i)^2, +\quad |Y_i - X_i| \leq \delta \\ +\delta * (|Y_i - X_i| - 0.5 * \delta), +\quad otherwise \end{cases} +$$ + +In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. 
)DOC"); } @@ -117,8 +123,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL(huber_loss, - ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu index 317321dc6c495f6e9a8808d841c71bfa26b754d0..ccc83a16ba271f0aa879c5c075dce1932dd40494 100644 --- a/paddle/operators/huber_loss_op.cu +++ b/paddle/operators/huber_loss_op.cu @@ -1,23 +1,24 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/huber_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(huber_loss, - ops::HuberLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h index 4e7bc5543226e19fe0d6190171cdd9c2b3d2d985..4dd20e8b080ab8bd2e61830241d64ee8546a80ec 100644 --- a/paddle/operators/huber_loss_op.h +++ b/paddle/operators/huber_loss_op.h @@ -41,7 +41,7 @@ struct HuberLossForward { T delta; }; -template +template class HuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -50,7 +50,8 @@ class HuberLossKernel : public framework::OpKernel { auto* out0 = context.Output("Residual"); auto* out1 = context.Output("Out"); auto delta = static_cast(context.Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); @@ -85,7 +86,7 @@ struct HuberLossBackward { T delta; }; -template +template class HuberLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -94,7 +95,8 @@ class HuberLossGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); auto* out1 = context.Output(framework::GradVarName("Y")); auto delta = static_cast(context.op().Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto residual = EigenVector::Flatten(*in0); auto out_grad = EigenVector::Flatten(*in1); diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 35efb12932f1d61fdb511b4ee2cdab3891507c61..e0b80cc4e74429dee1b9a25e41b116970ad4de2a 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/op_registry.h" @@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &out = *scope.FindVar(Output("Out"))->GetMutable(); @@ -61,6 +61,8 @@ class IncrementOp : public framework::OperatorBase { out.Resize(x.dims()); out.mutable_data(x.place(), x.type()); float value = Attr("step"); + VLOG(10) << Output("Out") << " increase " << Input("X") << " with " + << value; framework::VisitDataType(framework::ToDataType(out.type()), IncrementFunctor(x, &out, value)); } @@ -68,8 +70,7 @@ class IncrementOp : public framework::OperatorBase { class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { public: - IncrementOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of increment operator"); AddOutput("Out", "(Tensor) The output tensor of increment operator."); @@ -92,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("increment"); grad_op->SetInput("X", Output("Out")); grad_op->SetOutput("Out", Input("X")); grad_op->SetAttr("step", -boost::get(GetAttr("step"))); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc index 54fecf44e881b5c283c81580fd161da9808d253e..492ae48845aa5aa123989e62d07f5ae899af6193 100644 --- a/paddle/operators/is_empty_op.cc +++ b/paddle/operators/is_empty_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
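Note what IncrementGradOpMaker above constructs: not an element-wise gradient, but a second increment op with the step negated and the X/Out roles swapped, which reconstructs the input from the output since (x + step) + (-step) == x. A sketch of that round trip (standalone and illustrative):

#include <iostream>

// increment: out = x + step. The registered "grad" op is another increment
// with a negated step that maps Out back to X, inverting the forward op.
float Increment(float x, float step) { return x + step; }

int main() {
  float step = 2.f;
  float out = Increment(5.f, step);    // forward pass: 7
  float back = Increment(out, -step);  // registered grad op: recovers 5
  std::cout << out << " " << back << "\n";
}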
*/ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { // get input auto *var = scope.FindVar(Input(kInput)); PADDLE_ENFORCE_NOT_NULL(var); @@ -47,8 +47,7 @@ class IsEmptyOp : public framework::OperatorBase { class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - IsEmptyOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(kInput, "(Tensor) Tensor which is to be checked."); AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index 02ebf022968e95d0b20598d3c935fb51177c8841..1a5d6e19263325821dd220d8a31c0e34600b8220 100644 --- a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/l1_norm_op.h" @@ -48,7 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel { class L1NormOpMaker : public framework::OpProtoAndCheckerMaker { public: - L1NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of l1_norm op."); AddOutput("Out", "(Scalar) The output of l1_norm op."); @@ -69,7 +69,8 @@ $$Out = \sum{|X|}$$ namespace ops = paddle::operators; REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL(l1_norm, - ops::L1NormKernel); REGISTER_OP_CPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu index 1c206e04ccbb5f4c2cb9d45aef7bac17c62d55c5..7ecc774670a8480da46ac688f3635e04b1ab7c1f 100644 --- a/paddle/operators/l1_norm_op.cu +++ b/paddle/operators/l1_norm_op.cu @@ -1,22 +1,23 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/l1_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(l1_norm, - ops::L1NormKernel); -REGISTER_OP_GPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h index 3c60dc3dc7415f34ed9d238e6f41b197ec404883..086d42705dceaf0cbd35ce8e5115156a76a0b6e8 100644 --- a/paddle/operators/l1_norm_op.h +++ b/paddle/operators/l1_norm_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
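Per the DOC of l1_norm in l1_norm_op.cc above, the forward pass computes $$Out = \sum{|X|}$$, and, as the kernel comments in l1_norm_op.h note, the backward pass is dX = dout * sign(X). A standalone numeric sketch with plain loops in place of the Eigen expressions:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Forward: Out = sum(|X|). Backward: dX = dout * sign(X), with sign(0) = 0,
// matching Eigen's x.sign() used by L1NormGradKernel.
float L1Norm(const std::vector<float>& x) {
  float sum = 0.f;
  for (float v : x) sum += std::fabs(v);
  return sum;
}

std::vector<float> L1NormGrad(const std::vector<float>& x, float d_out) {
  std::vector<float> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    dx[i] = d_out * (x[i] > 0.f ? 1.f : (x[i] < 0.f ? -1.f : 0.f));
  return dx;
}

int main() {
  std::vector<float> x = {-1.f, 2.f, -3.f};
  std::cout << L1Norm(x) << "\n";                            // 6
  for (float g : L1NormGrad(x, 1.f)) std::cout << g << " ";  // -1 1 -1
}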
*/ #pragma once #include "paddle/framework/eigen.h" @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(abs(X)) -template +template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class L1NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); out.device(place) = x.abs().sum(); } }; // dX = dout * sign(X) -template +template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -52,7 +53,8 @@ class L1NormGradKernel : public framework::OpKernel { auto x_eigen = framework::EigenVector::Flatten(*x); auto d_out_eigen = framework::EigenVector::Flatten(*d_out); auto dx_eigen = framework::EigenVector::Flatten(*dx); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); Eigen::DSizes x_dsize(x->numel()); dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 066bdf67aa037e9c25cfdfaff7ec8771eb59cde8..975e394c78db037a125adeb2c86e3c74dc0eb6f8 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -19,8 +19,7 @@ namespace operators { class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { public: - LinearChainCRFOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Emission", "(LoDTensor, default LoDTensor) " @@ -32,19 +31,19 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "[(D + 2) x D]. The learnable parameter for the linear_chain_crf " "operator. See more details in the operator's comments."); AddInput("Label", - "(LoDTensor, default LoDTensor) A LoDTensor with shape " + "(LoDTensor, default LoDTensor) A LoDTensor with shape " "[N x 1], where N is the total element number in a mini-batch. " "The ground truth."); AddOutput( "Alpha", "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " - "The forward vectors for the entire batch. Denote it as \f$\alpha\f$. " - "\f$\alpha$\f is a memo table used to calculate the normalization " - "factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized " + "The forward vectors for the entire batch. Denote it as $\alpha$. " + "$\alpha$ is a memo table used to calculate the normalization " + "factor in CRF. $\alpha[k, v]$ stores the unnormalized " "probabilites of all possible unfinished sequences of tags that end at " - "position \f$k$\f with tag \f$v$\f. For each \f$k$\f, " - "\f$\alpha[k, v]$\f is a vector of length \f$D$\f with a component for " - "each tag value \f$v$\f. This vector is called a forward vecotr and " + "position $k$ with tag $v$. For each $k$, " + "$\alpha[k, v]$ is a vector of length $D$ with a component for " + "each tag value $v$. This vector is called a forward vector and " "will also be used in backward computations.") .AsIntermediate(); AddOutput( @@ -73,9 +72,9 @@ LinearChainCRF Operator.
Conditional Random Field defines an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these -variables. CRF learns the conditional probability \f$P(Y|X)\f$, where -\f$X = (x_1, x_2, ... , x_n)\f$ are structured inputs and -\f$Y = (y_1, y_2, ... , y_n)\f$ are labels for the inputs. +variables. CRF learns the conditional probability $P(Y|X)$, where +$X = (x_1, x_2, ... , x_n)$ are structured inputs and +$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs. Linear chain CRF is a special case of CRF that is useful for sequence labeling task. Sequence labeling tasks do not assume a lot of conditional @@ -88,21 +87,22 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. Equation: -1. Denote Input(Emission) to this operator as \f$x\f$ here. +1. Denote Input(Emission) to this operator as $x$ here. 2. The first D values of Input(Transition) to this operator are for starting -weights, denoted as \f$a\f$ here. +weights, denoted as $a$ here. 3. The next D values of Input(Transition) of this operator are for ending -weights, denoted as \f$b\f$ here. +weights, denoted as $b$ here. 4. The remaning values of Input(Transition) are for transition weights, -denoted as \f$w\f$ here. -5. Denote Input(Label) as \f$s\f$ here. - -The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as: -\f$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} - + \sum_{l=1}^L x_{s_l} - + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ -where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over -all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight +denoted as $w$ here. +5. Denote Input(Label) as $s$ here. + +The probability of a sequence $s$ of length $L$ is defined as: +$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} + + \sum_{l=1}^L x_{s_l} + + \sum_{l=2}^L w_{s_{l-1},s_l})$$ + +where $Z$ is a normalization value so that the sum of $P(s)$ over +all possible sequences is 1, and $x$ is the emission feature weight to the linear chain CRF. Finally, the linear chain CRF operator outputs the logarithm of the conditional @@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of linear_chain_crf // is determined by its input "Emission". - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), @@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of output of the linear_chain_crf_grad // operator is determined by its input: gradients of LogLikelihood. 
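Following the equation above, the score of one tag sequence decomposes into start, end, emission, and transition terms. The sketch below computes only the unnormalized score; the normalizer Z, which the operator obtains efficiently with the forward algorithm through the Alpha table, is deliberately omitted. All names and toy weights are illustrative:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Unnormalized score of one tag sequence s:
// exp(a[s_1] + b[s_L] + sum_l x[l][s_l] + sum_{l>=2} w[s_{l-1}][s_l]).
// Z would be the sum of this quantity over all D^L possible sequences.
double SequenceScore(const std::vector<double>& a,  // start weights, size D
                     const std::vector<double>& b,  // end weights, size D
                     const std::vector<std::vector<double>>& w,  // D x D
                     const std::vector<std::vector<double>>& x,  // L x D
                     const std::vector<int>& s) {   // tag sequence, length L
  double log_score = a[s.front()] + b[s.back()];
  for (std::size_t l = 0; l < s.size(); ++l) log_score += x[l][s[l]];
  for (std::size_t l = 1; l < s.size(); ++l) log_score += w[s[l - 1]][s[l]];
  return std::exp(log_score);
}

int main() {
  std::vector<double> a = {0.1, 0.2}, b = {0.0, 0.3};
  std::vector<std::vector<double>> w = {{0.5, -0.5}, {-0.5, 0.5}};
  std::vector<std::vector<double>> x = {{1.0, 0.0}, {0.0, 1.0}};
  std::cout << SequenceScore(a, b, w, x, {0, 1}) << "\n";  // exp(1.9)
}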
- framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( @@ -260,9 +260,10 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, linear_chain_crf_grad, ops::LinearChainCRFGradOp); REGISTER_OP_CPU_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu index 6fc8995f4c2ce05f89ffb58129695113f89159fa..da612510b4d45d8eefabe7de303e9fd0132c5f77 100644 --- a/paddle/operators/linear_chain_crf_op.cu +++ b/paddle/operators/linear_chain_crf_op.cu @@ -1,10 +1,10 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -16,11 +16,12 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index ddf73981751798c72cef08f2dd5c87580b45aec3..f502ebefde1fbd4b366f76d2915d94a23a124e5f 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -50,7 +50,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class LinearChainCRFOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -137,7 +137,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { framework::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context() + .eigen_device(); auto x = EigenMatrix::From(*emission_weights); auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -195,7 +196,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyLoDTensor = [](const platform::DeviceContext& ctx, const LoDTensor& src, LoDTensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - dst->CopyFrom(src, platform::CPUPlace(), ctx); + framework::Copy(src, platform::CPUPlace(), ctx, dst); }; copyLoDTensor(ctx, emission_weights_src, emission_weights_dst); @@ -203,8 +204,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { 
transition_weights_dst->mutable_data(transition_weights_src.dims(), platform::CPUPlace()); - transition_weights_dst->CopyFrom(transition_weights_src, - platform::CPUPlace(), ctx); + framework::Copy(transition_weights_src, platform::CPUPlace(), ctx, + transition_weights_dst); } void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, @@ -218,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { // operators runs on GPU device. auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { - dst->mutable_data(platform::GPUPlace()); - dst->CopyFrom(src, platform::GPUPlace(), ctx); + dst->mutable_data(platform::CUDAPlace()); + framework::Copy(src, platform::CUDAPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -271,7 +272,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { ll -= std::log(sum); // Now ll is equal to -log(Z). - const int* lbl = label.data(); + const int64_t* lbl = label.data(); PADDLE_ENFORCE_LT( static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, "An invalid tag label that exceeds the largest tag number."); @@ -287,7 +288,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { } }; -template +template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -359,8 +360,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { emission_grad->mutable_data(platform::CPUPlace()); if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - transition_grad, 0.); + math::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. @@ -384,10 +384,10 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); - BackwardOneSequence(ctx.device_context(), ll_grad[i], - one_seq_emission_exps, *transition_exps, - one_seq_alpha, one_seq_label, &one_seq_beta, - transition_grad, &one_seq_emission_grad); + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); } if (platform::is_gpu_place(ctx.GetPlace())) { @@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // Copy the inputs from GPU memory to CPU memory when this operator runs on // GPU device.
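Note the coordinated dtype change in both kernels above: label buffers are now read as int64_t. The change has to land everywhere at once, because reading an int64 tensor through the old 32-bit pointer would reinterpret each 8-byte label as two ints and silently corrupt tag indices:

// Correct after this change: the label tensor stores 64-bit integers.
const int64_t* lbl = label.data<int64_t>();
// const int* bad = label.data<int>();  // would misread the same bytes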
label_dst->mutable_data(label_src.dims(), platform::CPUPlace()); - label_dst->CopyFrom(label_src, platform::CPUPlace(), ctx); + framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst); auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - dst->CopyFrom(src, platform::CPUPlace(), ctx); + framework::Copy(src, platform::CPUPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -433,23 +433,23 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src, Tensor* dst) { if (src && dst) { - dst->mutable_data(platform::GPUPlace()); - dst->CopyFrom(*src, platform::GPUPlace(), ctx); + dst->mutable_data(platform::CUDAPlace()); + framework::Copy(*src, platform::CUDAPlace(), ctx, dst); } }; copyTensor(ctx, emission_grad_src, emission_grad_dst); copyTensor(ctx, transition_grad_src, transition_grad_dst); } - void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad, - const Tensor& emission_exps, + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, const Tensor& alpha, const Tensor& label, Tensor* beta, Tensor* transition_grad, Tensor* emission_grad) const { const T* w_exps = transition_exps.data(); const T* x_exps = emission_exps.data(); - const int* label_value = label.data(); + const int64_t* label_value = label.data(); T* beta_value = beta->data(); auto x_dims = emission_exps.dims(); @@ -481,7 +481,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto alpha_mat = EigenMatrix::From(alpha); auto beta_mat = EigenMatrix::From(*beta); - auto* place = ctx.GetEigenDevice(); + auto* place = ctx.eigen_device(); auto prob = alpha_mat * beta_mat; auto row_sum = prob.sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(seq_length, 1)) diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index b71a33a6b1ce80b545e6d7a4020dafc941dc55d2..f886b423ac7cb89961d1fdb5c6d3776ccafcaf60 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -1,20 +1,20 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include #include "paddle/framework/op_registry.h" - -#include +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto filename = Attr("file_path"); std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", @@ -39,62 +39,10 @@ class LoadOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); - uint32_t version; - fin.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); - framework::TensorDesc desc; - { // int32_t size - // proto buffer - int32_t size; - fin.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - fin.read(reinterpret_cast(buf.get()), size); - PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), - "Cannot parse tensor desc"); - } - { // read tensor - std::vector dims; - dims.reserve(static_cast(desc.dims().size())); - std::copy(desc.dims().begin(), desc.dims().end(), - std::back_inserter(dims)); - tensor->Resize(framework::make_ddim(dims)); - - void *buf; - platform::Place cpu = platform::CPUPlace(); - switch (desc.data_type()) { - case framework::FP32: - buf = tensor->mutable_data(cpu); - break; - case framework::FP64: - buf = tensor->mutable_data(cpu); - break; - case framework::INT32: - buf = tensor->mutable_data(cpu); - break; - case framework::INT64: - buf = tensor->mutable_data(cpu); - break; - default: - PADDLE_THROW("DataType %d not supported", desc.data_type()); - } - fin.read(static_cast(buf), tensor->memory_size()); - } - { // read lod - uint64_t lod_level; - fin.read(reinterpret_cast(&lod_level), sizeof(lod_level)); - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - fin.read(reinterpret_cast(&size), sizeof(size)); - std::vector tmp(size / sizeof(size_t)); - fin.read(reinterpret_cast(tmp.data()), - static_cast(size)); - lod[i] = tmp; - } - } + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + DeserializeFromStream(fin, tensor, dev_ctx); - auto place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { // copy CPU to GPU framework::LoDTensor cpu_tensor; @@ -105,15 +53,14 @@ class LoadOp : public framework::OperatorBase { out_var->Clear(); tensor = out_var->GetMutable(); tensor->set_lod(cpu_tensor.lod()); - tensor->CopyFrom(cpu_tensor, place, dev_ctx); + Copy(cpu_tensor, place, dev_ctx, tensor); } } }; class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoadOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "(Tensor) The tensor need to be loaded"); AddAttr("file_path", diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc index 80445eb575703be3354595672a4c064b30e0f18c..d2c52745cfdf8d0fdb168ef2d90e75a515c31015 100644 --- a/paddle/operators/lod_array_length_op.cc +++ b/paddle/operators/lod_array_length_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
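Everything the deleted block parsed by hand now lives behind DeserializeFromStream, with the device context looked up from the global DeviceContextPool by the Place the operator runs on. For reference, the on-disk layout implied by the removed reads (field names are descriptive, not from the source; several angle-bracket tokens in this hunk, such as the <fstream> include and the reinterpret_cast target types, were lost in extraction):

// Serialized LoDTensor layout reconstructed from the deleted reader:
//   uint32_t version;          // must be 0
//   int32_t  desc_size;        // byte size of the TensorDesc protobuf
//   char     desc[desc_size];  // framework::TensorDesc: dims + data_type
//   char     data[...];        // raw payload, tensor->memory_size() bytes
//   uint64_t lod_level;        // number of LoD levels
//   per level: uint64_t size;  // then size / sizeof(size_t) offsets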
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" @@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &out = *scope.FindVar(Output("Out"))->GetMutable(); @@ -38,17 +38,20 @@ class LoDArrayLengthOp : public framework::OperatorBase { class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoDArrayLengthProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensorArray) The input tensor array."); AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); - AddComment(R"DOC(Get the length of lod tensor array + AddComment(R"DOC( +LoDArrayLength Operator. -Out = len(X) +This operator obtains the length of lod tensor array: + +$$Out = len(X)$$ NOTE: The output is a CPU Tensor since the control variable should be only in CPU and the length of LoDTensorArray should be used as control variables. + )DOC"); } }; diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc index f7d4db1947b83fecf57575e17fafe26795c92bdd..692b9bf3710d764eceafda8390eedb8590794ddf 100644 --- a/paddle/operators/lod_rank_table_op.cc +++ b/paddle/operators/lod_rank_table_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/op_registry.h" namespace paddle { @@ -24,19 +24,19 @@ class LoDRankTableOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto x = scope.FindVar(Input("X"))->Get(); auto *out = scope.FindVar(Output("Out"))->GetMutable(); VLOG(10) << "Level = " << static_cast(Attr("level")); out->Reset(x.lod(), static_cast(Attr("level"))); + VLOG(10) << Input("X") << "'s lod information is " << *out; } }; class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoDRankTableOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) input lod tensor, must contain lod information."); @@ -63,11 +63,11 @@ class LoDRankTableInferShape : public framework::InferShapeBase { class LoDRankTableInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("Out")) { - block->FindRecursiveOrCreateVar(o)->SetType( - framework::VarDesc::LOD_RANK_TABLE); + block->FindRecursiveOrCreateVar(o).SetType( + framework::proto::VarDesc::LOD_RANK_TABLE); } } }; diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc index 32831cb1e2cf188a507773ef1e00b22de98d82ab..3d7b15edcfece84ffea539591a4db2690bd82029 100644 --- a/paddle/operators/lod_reset_op.cc +++ b/paddle/operators/lod_reset_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/lod_reset_op.h" @@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -48,8 +48,7 @@ class LoDResetOp : public framework::OperatorWithKernel { class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { public: - LoDResetOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) The input tensor of lod_reset operator."); AddInput("TargetLoD", @@ -98,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu index 5244a17c3aad01909e3b8cf5f4d5abf8a44edc7f..910866ea6330059f8e0b04e036e3b124e920b5c4 100644 --- a/paddle/operators/lod_reset_op.cu +++ b/paddle/operators/lod_reset_op.cu @@ -1,24 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/lod_reset_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_GPU_KERNEL( - lod_reset_grad, ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index 2bb916ccee80c83a02ea429fe95f5fafc86ccfa6..c1bbba7a83a3d63a94c54390cacbcdf773ab6b98 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class LoDResetKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -33,7 +33,8 @@ class LoDResetKernel : public framework::OpKernel { auto* lod = lod_t->data(); if (platform::is_gpu_place(ctx.GetPlace())) { framework::Tensor lod_cpu; - lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context()); + framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); lod = lod_cpu.data(); } level0 = std::vector(lod, lod + lod_t->numel()); @@ -64,7 +65,7 @@ class LoDResetKernel : public framework::OpKernel { } }; -template +template class LoDResetGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 58af35564d83b9699af4f7783fb6367ff9590682..685a807a8acafd36f44161fb17e0e88070d0bf43 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -1,19 +1,21 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -31,16 +33,21 @@ class LoDTensorToArrayOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { - auto &x = scope.FindVar(Input("X"))->Get(); - auto &rank_table = - scope.FindVar(Input("RankTable"))->Get(); - auto &out = - *scope.FindVar(Output("Out"))->GetMutable(); - + const platform::Place &place) const override { + auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", + Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable"))) + .Get(); + auto &out = *detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); auto &items = rank_table.items(); auto max_seq_len = items[0].length; auto rank_level = rank_table.level(); + + PADDLE_ENFORCE_LT(rank_level, x.lod().size(), + "Input should be a LOD tensor, and size is at least %d", + rank_level + 1); out.resize(max_seq_len); std::vector> copy_ranges(max_seq_len); @@ -55,16 +62,13 @@ class LoDTensorToArrayOp : public framework::OperatorBase { size_t start_idx = x.lod()[rank_level][item.index] + t; auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( x.lod(), start_idx, start_idx + 1, rank_level + 1); - auto &lod_length = lod_and_offset.first; framework::AppendLoD(&lod, lod_length); - size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); } } - for (size_t i = 0; i < max_seq_len; ++i) { auto &ranges = copy_ranges[i]; size_t height = std::accumulate( @@ -81,11 +85,16 @@ class LoDTensorToArrayOp : public framework::OperatorBase { continue; } // out[i][offset: offset+len] = x[each_range.begin: each_range.end] - out[i] - .Slice(static_cast(offset), static_cast(offset + len)) - .CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx); + auto slice = out[i].Slice(static_cast(offset), + static_cast(offset + len)); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } @@ -94,8 +103,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - 
LoDTensorToArrayOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", ""); AddInput("RankTable", ""); @@ -125,10 +133,10 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { class LoDTensorToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY); + block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); } } }; @@ -138,14 +146,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("array_to_lod_tensor"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f714945354c5668f58e273dc8d6c7c16d51ac17d --- /dev/null +++ b/paddle/operators/log_loss_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/log_loss_op.h" + +namespace paddle { +namespace operators { + +class LogLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Predicted) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Predicted", "Loss"); + } +}; + +template +class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Predicted", + "The input value (Predicted) of Log loss op." + "Predicted is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Log loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the log loss."); + AddAttr("epsilon", "Epsilon in log loss."); + AddComment(R"DOC( +LogLoss Operator. + +Log loss is a loss function used for binary classification. Log Loss quantifies +the accuracy of a classifier by penalising false classifications. Minimising the +Log Loss is equivalent to maximising the accuracy of the classifier. We define +Predicted as the values predicted by our model and Labels as the target ground +truth value. Log loss can evaluate how close the predicted values are to the +target. The shapes of Predicted and Labels are both [batch_size, 1]. 
+The equation is: + +$$ +Loss = - Labels * log(Predicted + \epsilon) - + (1 - Labels) * log(1 - Predicted + \epsilon) +$$ + +)DOC"); + } +}; + +class LogLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")), + "Output(Predicted@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Predicted"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, + ops::LogLossGradOp); +REGISTER_OP_CPU_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..be283e470052cc3a569be564ab4baa6bc5b75808 --- /dev/null +++ b/paddle/operators/log_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/log_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..743eddb74004b5e87ed9b8a6ccb1b8496b8548dc --- /dev/null +++ b/paddle/operators/log_loss_op.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
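A quick numeric sanity check of the loss equation above (hypothetical values; epsilon only guards log(0) and is omitted here): a confident correct prediction costs little, a confident wrong one costs a lot.

#include <cmath>
#include <cstdio>

int main() {
  double predicted = 0.9;
  std::printf("%.3f\n", -std::log(predicted));        // Labels = 1: ~0.105
  std::printf("%.3f\n", -std::log(1.0 - predicted));  // Labels = 0: ~2.303
  return 0;
}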
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class LogLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* loss_out = ctx.Output("Loss"); + + loss_out->mutable_data(ctx.GetPlace()); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto loss = EigenVector::Flatten(*loss_out); + auto& place = *ctx.template device_context().eigen_device(); + + loss.device(place) = (-(label * (prediction + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - prediction + epsilon).log())); + } +}; + +template +class LogLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + + auto dl = EigenVector::Flatten(*dloss); + auto& place = *ctx.template device_context().eigen_device(); + + if (dpred) { + dpred->mutable_data(ctx.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + dx.device(place) = dl * (-(label / (prediction + epsilon)) + + ((static_cast(1) - label) / + (static_cast(1) - prediction + epsilon))); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index a37582c1d840ac11f847d8743c824ef1aef0fd66..fedd325cf4f7b5a779d17e0259a16d5cf39b77b7 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
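The gradient kernel above is the analytic derivative of that loss with respect to Predicted, scaled elementwise by the incoming gradient dl; written out, the dx.device(place) = ... line computes

$$\frac{\partial Loss}{\partial Predicted} = -\frac{Labels}{Predicted + \epsilon} + \frac{1 - Labels}{1 - Predicted + \epsilon}$$

multiplied by the upstream gradient of Loss.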
*/ #include "paddle/operators/logical_op.h" #include "paddle/framework/op_registry.h" @@ -20,8 +20,7 @@ namespace operators { template class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - BinaryLogicalOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { OpComment comment; AddInput("X", @@ -45,8 +44,7 @@ Each element of Out is calculated by %s template class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - UnaryLogicalOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { OpComment comment; AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator", @@ -101,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // LogicalOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; @@ -139,15 +137,16 @@ class LogicalOp : public framework::OperatorWithKernel { ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker); -REGISTER_BINARY_LOGICAL_OP(logical_and, "Out = X && Y"); +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_OP(logical_or, "Out = X && Y"); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_OP(logical_not, "Out = !X"); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_OP(logical_xor, "Out = (X || Y) && !(X && Y)"); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu index d41239b2ca43e7145ea56afcb0af69948838cc48..87f2287b8f11aabe8afe87776eff49295c1ea2ac 100644 --- a/paddle/operators/logical_op.cu +++ b/paddle/operators/logical_op.cu @@ -1,24 +1,24 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/logical_op.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU, +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h index 6e78a7d6ed87ba950886e6bc667f82118ff78904..413857685603c7b84e885135d9aadf7cc71a4f72 100644 --- a/paddle/operators/logical_op.h +++ b/paddle/operators/logical_op.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include @@ -47,7 +47,7 @@ struct LogicalXorFunctor { } }; -template +template class BinaryLogicalOpKernel : public framework::OpKernel { public: @@ -57,14 +57,14 @@ class BinaryLogicalOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; -template +template class UnaryLogicalOpKernel : public framework::OpKernel { public: @@ -73,8 +73,9 @@ class UnaryLogicalOpKernel auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), out->mutable_data(context.GetPlace()), unary_func); } }; @@ -85,9 +86,9 @@ class UnaryLogicalOpKernel #define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); #define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 93e812ac5be5aea6bf3ab353d31480322c51ccbc..bb03def4391da80c6219f7863d300fd3c8d8c7ac 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
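Several template argument lists in this file were stripped in extraction (the bare platform::Transform and device_context() calls above). A hedged reconstruction of the binary kernel body, assuming the DeviceContext and element type T that the registration macros at the bottom of the file imply:

// Presumed full spelling: Transform is parameterized by the device context
// type (CPUDeviceContext or CUDADeviceContext), mirroring the kernel itself.
platform::Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(),
      x->data<T>(), x->data<T>() + x->numel(), y->data<T>(),
      out->mutable_data<bool>(context.GetPlace()),  // logical ops emit bool
      binary_func);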
*/ #include "paddle/operators/lookup_table_op.h" #include "paddle/framework/var_type_inference.h" @@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), @@ -51,8 +51,7 @@ class LookupTableOp : public framework::OperatorWithKernel { class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { public: - LookupTableOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("W", "An input represents embedding tensors, " @@ -99,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), @@ -109,19 +108,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS); + block->Var(out_var_name) + ->SetType(framework::proto::VarDesc::SELECTED_ROWS); } else { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR); + block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR); } } }; diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 84b044184a36a0d3a72a4105d6baf401b4774cf7..261a28da694bf551d8d9e630139680aebc4be51a 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -1,13 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" @@ -85,6 +88,8 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); bool is_sparse = context.Attr("is_sparse"); if (is_sparse) { auto* ids = context.Input("Ids"); @@ -95,11 +100,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto* ids_data = ids->data(); auto ids_dim = ids->dims(); - auto stream = context.cuda_device_context().stream(); + auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); - auto gpu_place = boost::get(context.GetPlace()); + auto gpu_place = boost::get(context.GetPlace()); memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, ids_dim[0] * sizeof(int64_t), stream); @@ -129,14 +134,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { T* d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); dim3 threads(128, 8); dim3 grids(8, 1); - LookupTableGrad< - T, 128, 8, - 8><<>>( + LookupTableGrad<<>>( d_table, d_output, ids, N, K, D); } } @@ -146,7 +148,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); -REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h index 99b912163b71594340d8917645dff107fd208aea..2fd3335868406455ec01f9ded6bacc7bda5e2a67 100644 --- a/paddle/operators/lookup_table_op.h +++ b/paddle/operators/lookup_table_op.h @@ -1,13 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
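The kernel launch in the lookup_table gradient also lost its chevrons in extraction. Given dim3 threads(128, 8) and dim3 grids(8, 1) above, the presumed form is an ordinary CUDA launch on the device context's stream, with the old template arguments <T, 128, 8, 8> retained:

// Presumed reconstruction of the mangled launch line:
dim3 threads(128, 8);
dim3 grids(8, 1);
LookupTableGrad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
    d_table, d_output, ids, N, K, D);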
*/ #pragma once diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index 00392b7967d020a7951a16a7850a2f08735baeb8..95673ba19e776b3c52eb492d0b14d761b584f807 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/lrn_op.h" @@ -19,6 +19,103 @@ namespace operators { using framework::Tensor; +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + auto x_v = framework::EigenVector::Flatten(input); + + const int start = -(n - 1) / 2; + const int end = start + n; + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto e_x = framework::EigenTensor::From(input); + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch >= 0 && ch < C) { + auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + s += alpha * r.square(); + } + } + } + } + + auto out_e = framework::EigenVector::Flatten(*out); + out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + } +}; +template struct LRNFunctor; +template struct LRNFunctor; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + T ratio = -2 * alpha * beta; + auto x_g_e = framework::EigenVector::Flatten(*x_g); + x_g_e = x_g_e.constant(0.0); + + auto e_x = framework::EigenTensor::From(x); + auto e_x_g = framework::EigenTensor::From(*x_g); + auto e_out = framework::EigenTensor::From(out); + auto e_out_g = framework::EigenTensor::From(out_g); + auto e_mid = framework::EigenTensor::From(mid); + + const int start = -(n - 1) / 2; + const int end = start + n; + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_out_g = 
e_out_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g = i_mid.pow(-beta) * i_out_g; + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch < 0 || ch >= C) { + continue; + } + + auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g += ratio * c_out_g * c_out * i_x / c_mid; + } + } + } + } +}; +template struct LRNGradFunctor; +template struct LRNGradFunctor; + class LRNOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -43,7 +140,7 @@ class LRNOp : public framework::OperatorWithKernel { template class LRNOpMaker : public framework::OpProtoAndCheckerMaker { public: - LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of LRN operator. " @@ -83,8 +180,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Local Response Normalization Operator. -This operator comes from the paper -"ImageNet Classification with Deep Convolutional Neural Networks". +This operator comes from the paper: +<<ImageNet Classification with Deep Convolutional Neural Networks>>. The original formula is: @@ -107,7 +204,7 @@ Input(i, x, y), Output(i, x, y) represent an element in an image. C is the number of feature maps of one image. n is a hyper-parameter configured when the operator is initialized. The sum in the denominator is the sum of the same positions in the neighboring maps. - + )DOC"); } }; @@ -119,8 +216,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")), - "Input(MidOut@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); @@ -134,6 +230,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu index 607dc6d86a72b0a0c953f52782955dc530b7478c..eb9d66a73dfe3e22f1151d73ce5e34f2eda0835e 100644 --- a/paddle/operators/lrn_op.cu +++ b/paddle/operators/lrn_op.cu @@ -1,22 +1,178 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/operators/lrn_op.h" -namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C, + int H, int W, int size, T k, T alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + + in += offset; + mid += offset; + const int step = H * W; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + T accum = 0; + int index = 0; + while (index < C + post_pad) { + if (index < C) { + T val = in[index * step]; + accum += val * val; + } + if (index >= size) { + T val = in[(index - size) * step]; + accum -= val * val; + } + if (index >= post_pad) { + mid[(index - post_pad) * step] = k + accum * alpha; + } + ++index; + } + } +} + +template +__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid, + T negative_beta, T* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < input_size) { + out[index] = in[index] * pow(mid[index], negative_beta); + } +} + +template +void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, + T* outputs, T* mid, int N, int C, int H, int W, int n, T k, + T alpha, T beta) { + int img_size = N * H * W; + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( + img_size, inputs, mid, C, H, W, n, k, alpha); + + int input_size = N * H * W * C; + grid_size = (input_size + block_size - 1) / block_size; + KeCMRNormOutput<<>>( + input_size, inputs, mid, -beta, outputs); +} + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + CrossMapNormal( + ctx, input.data(), out->mutable_data(ctx.GetPlace()), + mid->mutable_data(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta); + } +}; + +template struct LRNFunctor; +template struct LRNFunctor; -REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_GPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +template +__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, + const T* mid, T* x_g, const T* out_g, int C, + int H, int W, int size, T negative_beta, + T ratio) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + x += offset; + out += offset; + 
mid += offset; + out_g += offset; + x_g += offset; + + const int step = H * W; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + T accum = 0; + // TODO(gongwb): optimize this with thread shared array. + while (index < C + post_pad) { + if (index < C) { + x_g[index * step] = 0.0; + accum += out_g[index * step] * out[index * step] / mid[index * step]; + } + if (index >= size) { + accum -= out_g[(index - size) * step] * out[(index - size) * step] / + mid[(index - size) * step]; + } + if (index >= post_pad) { + x_g[(index - post_pad) * step] += + out_g[(index - post_pad) * step] * + pow(mid[(index - post_pad) * step], negative_beta) - + ratio * x[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template +void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, + const T* out, const T* mid, T* x_g, const T* out_g, + int N, int C, int H, int W, int n, T alpha, T beta) { + int img_size = N * H * W; + + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( + img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, + 2.0f * alpha * beta); +} + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + CrossMapNormalGrad(ctx, x.data(), out.data(), mid.data(), + x_g->mutable_data(ctx.GetPlace()), out_g.data(), + N, C, H, W, n, alpha, beta); + } +}; + +template struct LRNGradFunctor; +template struct LRNGradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h index 606c65744303b53846c9077dfa832bdbeedb410e..ef3a2883a88ff321fb9a87ddaf31123a3b9ee90a 100644 --- a/paddle/operators/lrn_op.h +++ b/paddle/operators/lrn_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -21,7 +21,15 @@ namespace paddle { namespace operators { -template +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta); +}; + +template class LRNKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -31,8 +39,8 @@ class LRNKernel : public framework::OpKernel { // f(x) represents outputs void Compute(const framework::ExecutionContext& ctx) const override { // input - const Tensor* x = ctx.Input("X"); - auto x_dims = x->dims(); + const Tensor& x = *ctx.Input("X"); + auto x_dims = x.dims(); // NCHW int N = x_dims[0]; @@ -57,38 +65,20 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); - auto x_v = framework::EigenVector::Flatten(*x); - - const int start = -(n - 1) / 2; - const int end = start + n; - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid.device(ctx.GetEigenDevice()) = e_mid.constant(k); - - auto e_x = framework::EigenTensor::From(*x); - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - for (int c = start; c <= end; c++) { - int ch = i + c; - if (ch >= 0 && ch < C) { - auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - s.device(ctx.GetEigenDevice()) += alpha * r.square(); - } - } - } - } - - auto out_e = framework::EigenVector::Flatten(*out); - out_e.device(ctx.GetEigenDevice()) = - x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + LRNFunctor f; + f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); } }; +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta); +}; + /** * \brief Backward calculation for normalization across maps. * * The implementation of this Function is derived from the * CrossMapNormalFunc implementation. * - * InputGrad = OutputGrad * denoms ^ (-beta) + * InputGrad = OutputGrad * MidOut ^ (-beta) * -- upper * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue * -- lower * @@ -108,23 +98,20 @@ class LRNKernel : public framework::OpKernel { * The upper and lower terms are the same as in the forward pass. The logic of the sum * is also the same as in the forward pass.
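 * (A brief derivation of the lower term, an editorial sketch based only on
 * the definitions above: with Out = X * MidOut^(-beta) and
 * MidOut = k + alpha * sum over the window of X^2, the chain rule gives,
 * for each neighboring map j whose window covers channel i,
 *   dOut_j/dX_i = (-beta) * X_j * MidOut_j^(-beta-1) * (2 * alpha * X_i)
 *               = (-2 * alpha * beta) * OutputValue_j * X_i / MidOut_j,
 * which is exactly the lower term above and the `ratio = -2 * alpha * beta`
 * factor used by LRNGradFunctor.)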
*/ -template +template class LRNGradKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - const Tensor* out = ctx.Input("Out"); - const Tensor* out_g = ctx.Input(framework::GradVarName("Out")); - const Tensor* mid = ctx.Input("MidOut"); + const Tensor& x = *ctx.Input("X"); + const Tensor& out = *ctx.Input("Out"); + const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); + const Tensor& mid = *ctx.Input("MidOut"); auto x_g = ctx.Output(framework::GradVarName("X")); x_g->mutable_data(ctx.GetPlace()); - auto x_g_e = framework::EigenVector::Flatten(*x_g); - x_g_e.device(ctx.GetEigenDevice()) = x_g_e.constant(0.0); - - auto x_dims = x->dims(); + auto x_dims = x.dims(); int N = x_dims[0]; int C = x_dims[1]; int H = x_dims[2]; @@ -133,51 +120,9 @@ class LRNGradKernel : public framework::OpKernel { int n = ctx.Attr("n"); T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); - T ratio = -2 * alpha * beta; - - auto e_x = framework::EigenTensor::From(*x); - auto e_x_g = framework::EigenTensor::From(*x_g); - auto e_out = framework::EigenTensor::From(*out); - auto e_out_g = framework::EigenTensor::From(*out_g); - auto e_mid = framework::EigenTensor::From(*mid); - - const int start = -(n - 1) / 2; - const int end = start + n; - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - i_x_g.device(ctx.GetEigenDevice()) = i_mid.pow(-beta) * i_out_g; - for (int c = start; c <= end; c++) { - int ch = i + c; - if (ch < 0 || ch >= C) { - continue; - } - - auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - i_x_g.device(ctx.GetEigenDevice()) += - ratio * c_out_g * c_out * i_x / c_mid; - } - } - } + + LRNGradFunctor f; + f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } }; diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index 4cbb60f3fdab968e8c36d4fbad55fd3efc7b1d0d..3b90b64b4effacf7240fb1bee8c0aa44251ad727 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -102,7 +102,7 @@ class LSTMOp : public framework::OperatorWithKernel { class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { public: - LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(LoDTensor) the first input is a LodTensor, which support " @@ -181,7 +181,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Long-Short Term Memory (LSTM) Operator. 
-The defalut implementation is diagonal/peephole connection +The default implementation is diagonal/peephole connection (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: $$ @@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ h_t = o_t \odot act_h(c_t) $$ -where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix -of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$ +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ are diagonal weight matrices for peephole connections. In our implementation, we use vectors to represent these diagonal weight matrices. The b terms -denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$ +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ is the non-linear activation, such as the logistic sigmoid function, and -\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate, +$i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector \f$h\f$. +the cell output activation vector $h$. -The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$ +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ are the cell input and cell output activation functions and `tanh` is usually -used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state, +used for them. $\tilde{c_t}$ is also called the candidate hidden state, which is computed based on the current input and the previous hidden state. -Set `use_peepholes` False to disable peephole connection -(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula -is omitted here. +Set `use_peepholes` False to disable peephole connection. The formula +is omitted here, please refer to the paper +http://www.bioinf.jku.at/publications/older/2604.pdf for details. -Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ -operations on the input \f$x_{t}\f$ are NOT included in this operator. +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. Users can choose to use a fully-connected operator before the LSTM operator. )DOC"); @@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -273,8 +273,9 @@ namespace ops = paddle::operators; REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc index 610cbb03e890203407b1489800bc17f1a196d12c..cfcc1fc92a074c9bfe83e6c32560177edef12ae9 100644 --- a/paddle/operators/lstm_op.cu.cc +++ b/paddle/operators/lstm_op.cu.cc @@ -1,22 +1,23 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_GPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index 721aa42c92f2926aabbc13d0a9027b2b4e573225..c57ee414dc5b3417549c8ac3a7fd57a9c8f452df 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/operators/math/lstm_compute.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/sequence2batch.h" @@ -24,16 +25,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class LSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,8 +53,8 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, *batch_gate, true, is_reverse); auto in_dims = input->dims(); @@ -64,7 +65,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -73,24 +74,24 @@ class LSTMKernel : public framework::OpKernel { T* bias_data = const_cast(bias->data()); // the code style in LstmMetaValue will be updated later. 
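// (Editorial sketch, not part of the original patch: the pointer
// arithmetic below implies that, when use_peepholes is true, `bias` packs
// the four gate-bias segments first and then the three peephole vectors,
// each frame_size wide:
//   [4 * frame_size gate biases | check_ig | check_fg | check_og]
// so the peephole weights start at offset 4 * frame_size and the bias
// tensor holds 7 * frame_size elements in total.)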
- lstm_value.checkIg = bias_data + 4 * frame_size; - lstm_value.checkFg = lstm_value.checkIg + frame_size; - lstm_value.checkOg = lstm_value.checkFg + frame_size; + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; } else { - lstm_value.checkIg = nullptr; - lstm_value.checkFg = nullptr; - lstm_value.checkOg = nullptr; + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; } - lstm_value.prevStateValue = nullptr; + lstm_value.prev_state_value = nullptr; Tensor ordered_c0; const size_t* order = batch_gate->lod()[2].data(); if (cell_t0) { // Since the batch computing for LSTM reorders the input sequences // according to their length, the initialized cell state also needs // to be reordered. - ReorderInitState(device_ctx, *cell_t0, order, &ordered_c0, - true); - lstm_value.prevStateValue = ordered_c0.data(); + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstm_value.prev_state_value = ordered_c0.data(); } // Use the local variable here. @@ -102,9 +103,12 @@ class LSTMKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = ctx.Attr("gate_activation"); - auto cell_act = ctx.Attr("cell_activation"); - auto cand_act = ctx.Attr("candidate_activation"); + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); @@ -121,9 +125,9 @@ class LSTMKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden_t, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } else if (hidden_t0) { // If n == 0 and there is no initialized hidden state, that is to say // the H0 is zeros, the calculation W_h * H0 will be skipped. @@ -133,24 +137,24 @@ class LSTMKernel : public framework::OpKernel { // according to their length, the initialized hidden state also needs // to be reordered.
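// (Editorial note, assuming the gather semantics of CopyMatrixRowsFunctor
// when indexed_src is true: ReorderInitState computes dst[i] =
// src[order[i]], i.e. it permutes the rows of the initial state into the
// length-sorted batch order recorded in batch_gate->lod()[2].)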
Tensor ordered_h0; - ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, - true); - math::matmul(device_ctx, ordered_h0, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } - lstm_value.gateValue = gate_t.data(); - lstm_value.outputValue = out_t.data(); - lstm_value.stateValue = cell_t.data(); - lstm_value.stateActiveValue = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute(device_ctx, lstm_value, - frame_size, cur_batch_size, - gate_act, cell_act, cand_act); - lstm_value.prevStateValue = lstm_value.stateValue; + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, *hidden_out); @@ -161,7 +165,7 @@ class LSTMKernel : public framework::OpKernel { } }; -template +template class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -187,8 +191,8 @@ class LSTMGradKernel : public framework::OpKernel { auto* h0_g = ctx.Output(framework::GradVarName("H0")); auto* c0_g = ctx.Output(framework::GradVarName("C0")); - auto& device_ctx = ctx.device_context(); - math::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -200,7 +204,8 @@ class LSTMGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; const size_t* order = batch_gate->lod()[2].data(); if (c0) { - ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); } if (c0 && c0_g) { ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); @@ -214,13 +219,13 @@ class LSTMGradKernel : public framework::OpKernel { math::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); - lstm_value.checkIg = bias_data + 4 * frame_size; - lstm_value.checkFg = lstm_value.checkIg + frame_size; - lstm_value.checkOg = lstm_value.checkFg + frame_size; + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; } else { - lstm_value.checkIg = nullptr; - lstm_value.checkFg = nullptr; - lstm_value.checkOg = nullptr; + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; } math::LstmMetaGrad lstm_grad; @@ -231,19 +236,19 @@ class LSTMGradKernel : public framework::OpKernel { } if (bias && bias_g && ctx.Attr("use_peepholes")) { T* bias_g_data = bias_g->data(); - lstm_grad.checkIgGrad = bias_g_data + 4 * frame_size; - lstm_grad.checkFgGrad = lstm_grad.checkIgGrad + frame_size; - lstm_grad.checkOgGrad = lstm_grad.checkFgGrad + frame_size; + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + 
lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; } else { - lstm_grad.checkIgGrad = nullptr; - lstm_grad.checkFgGrad = nullptr; - lstm_grad.checkOgGrad = nullptr; + lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( - const platform::DeviceContext& ctx, const framework::LoDTensor& src, + const DeviceContext& ctx, const framework::LoDTensor& src, const framework::DDim& dims, framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); @@ -263,9 +268,12 @@ class LSTMGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = ctx.Attr("gate_activation"); - auto cell_act = ctx.Attr("cell_activation"); - auto cand_act = ctx.Attr("candidate_activation"); + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; @@ -276,30 +284,30 @@ class LSTMGradKernel : public framework::OpKernel { Tensor gate = batch_gate->Slice(bstart, bend); Tensor cell = batch_cell.Slice(bstart, bend); Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstm_value.gateValue = gate.data(); - lstm_value.stateValue = cell.data(); - lstm_value.stateActiveValue = cell_pre_act.data(); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); Tensor out_g = batch_hidden_g.Slice(bstart, bend); Tensor gate_g = batch_gate_g.Slice(bstart, bend); Tensor cell_g = batch_cell_g.Slice(bstart, bend); - lstm_grad.stateGrad = cell_g.data(); - lstm_grad.gateGrad = gate_g.data(); - lstm_grad.outputGrad = out_g.data(); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); if (n > 0) { int bstart_pre = static_cast(batch_starts[n - 1]); Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstm_value.prevStateValue = cell_pre.data(); - lstm_grad.prevStateGrad = cell_pre_g.data(); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); } else { - lstm_value.prevStateValue = c0 ? ordered_c0.data() : nullptr; - lstm_grad.prevStateGrad = c0_g ? ordered_c0_g.data() : nullptr; + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; } int cur_batch_size = bend - bstart; - math::LstmUnitGradFunctor::compute( + math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, gate_act, cell_act, cand_act); @@ -307,33 +315,34 @@ class LSTMGradKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &pre_hidden_g, - static_cast(1.0)); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); if (weight_g) { /* backward weight */ auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } } else { if (h0 && weight_g) { - ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); - math::matmul(device_ctx, ordered_h0, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } if (h0 && h0_g) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &ordered_h0_g, - static_cast(0.0)); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); } } } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); @@ -344,15 +353,17 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } if (h0 && h0_g) { - ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, false); + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); } if (c0 && c0_g) { - ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, false); + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); } } }; diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc index 18b9cdf2a39e8226c634194ff2cc56d169979774..c2d2c43982580c9724849d68576d42ffa44fc6b4 100644 --- a/paddle/operators/lstm_unit_op.cc +++ b/paddle/operators/lstm_unit_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/lstm_unit_op.h" @@ -48,10 +48,12 @@ class LstmUnitOp : public framework::OperatorWithKernel { class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { public: - LstmUnitOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "FC input before the non-linear activation."); + AddInput("X", + "Lstm unit only applies non-linear activations, please make sure " + "that linear transformation has already been applied to `X`. " + "Linear transformation can be applied by adding a `fc` layer"); AddInput( "C_prev", "The cell state tensor of last time-step in the Lstm Unit operator."); diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index e192283aa0afac49e8e467506f3703d1ce60d2a6..5ee5ddd280f7720c4583053e4e48a5043ab423f4 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ /* Acknowledgement: the following code is strongly inspired by https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu @@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); @@ -173,7 +173,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, - ops::LstmUnitOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, - ops::LstmUnitGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index 38cb298f92a21bb5c7508761fec701d28279a85f..fa8d141bcb6ee4bfc9a29e337b7adbc5ecd3ad23 100644 --- a/paddle/operators/lstm_unit_op.h +++ b/paddle/operators/lstm_unit_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ /* Acknowledgement: the following code is strongly inspired by https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h @@ -35,7 +35,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. 
* x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index d7e8a0ea7632650203106b01531d724cf0b8e085..e0df3077742bc330ce8510bf06b0411148f669d8 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/margin_rank_loss_op.h" @@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel { template class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - MarginRankLossOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X1", "(2-D tensor with shape [batch_size x 1]) The score for " @@ -117,7 +116,7 @@ REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); + ops::MarginRankLossKernel); REGISTER_OP_CPU_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu index 3a639f25d478a712c1030d57c57d7e55de1488b5..798c3ed182b08b07a779da88924bfc05743c680e 100644 --- a/paddle/operators/margin_rank_loss_op.cu +++ b/paddle/operators/margin_rank_loss_op.cu @@ -1,24 +1,24 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/margin_rank_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_GPU_KERNEL( + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h index 8d0830147ecc465909e8988e90125929829f6f34..7438e881e1c69c9ef1f84b21e6ee0ba093f3378a 100644 --- a/paddle/operators/margin_rank_loss_op.h +++ b/paddle/operators/margin_rank_loss_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -34,7 +34,7 @@ struct Heaviside { } }; -template +template class MarginRankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -56,13 +56,13 @@ class MarginRankLossKernel : public framework::OpKernel { auto x1 = framework::EigenVector::Flatten(*x1_t); auto x2 = framework::EigenVector::Flatten(*x2_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); act.device(dev) = out.unaryExpr(Heaviside()); } }; -template +template class MarginRankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -78,7 +78,7 @@ class MarginRankLossGradKernel : public framework::OpKernel { auto d_out = framework::EigenVector::Flatten(*d_out_t); auto act = framework::EigenVector::Flatten(*act_t); auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); // compute d_x1 if (d_x1_t) { diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 3017f133afc5d4dcd484c78b44591a876ab4d667..c607704efac86982c8c22e462381aaab488a9b69 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -9,12 +9,16 @@ if(WITH_GPU) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) + nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) - nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) + nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) + nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) + nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) - nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) + nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) + nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) + nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) @@ -22,15 +26,20 @@ else() cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) - cc_library(vol2col SRCS vol2col.cc DEPS device_context) + cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(context_project SRCS context_project.cc DEPS device_context math_function) 
- cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) + cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) + cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context) + cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) - cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) cc_library(maxouting SRCS maxouting.cc DEPS device_context) + cc_library(unpooling SRCS unpooling.cc DEPS device_context) + cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) + cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) +cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc index f82ea5d7bee81fd1578c46f79477bb23939e627a..980dd90df8710cdbcb760e1ca84f1492a76fdb70 100644 --- a/paddle/operators/math/context_project.cc +++ b/paddle/operators/math/context_project.cc @@ -18,8 +18,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu index 04eeed543cb165fe449d3578a951cf74b0422252..934e3df645916013b4d1fe5eb4a19be924c914d5 100644 --- a/paddle/operators/math/context_project.cu +++ b/paddle/operators/math/context_project.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index 72f4202bace4461d2597204feaa2a21e355bd1ac..218de9fb9564fa0fa30227a6710f57bb89f1a2c8 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -81,17 +81,17 @@ using LoDTensor = framework::LoDTensor; * */ -template +template class ContextProjectFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, const Tensor& padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Im2ColFunctor im2col_ocf; + math::Im2ColFunctor im2col_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -149,7 +149,7 @@ class ContextProjectFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size); - out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); + framework::Copy(w_sub, context.GetPlace(), context, 
&out_t_sub); } } if (down_pad > 0) { // add down pad @@ -179,7 +179,7 @@ class ContextProjectFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); } } out_t.Resize({sequence_height, context_length * sequence_width}); @@ -188,17 +188,17 @@ class ContextProjectFunctor { } }; -template +template class ContextProjectGradFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, bool pad_grad, bool input_grad, Tensor* padding_data, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Col2ImFunctor col2im_ocf; + math::Col2ImFunctor col2im_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -258,8 +258,8 @@ class ContextProjectGradFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } if (down_pad > 0) { @@ -290,8 +290,8 @@ class ContextProjectGradFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/cos_sim_functor.cc b/paddle/operators/math/cos_sim_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..6af9f0fcd967b4e8e9e338c155d5a8ee2866fbfa --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/operators/math/cos_sim_functor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    for (size_t row_id = 0; row_id < rows; ++row_id) {
+      auto xy_norm_prod = x_norm[row_id] * y_norm[0];
+      auto dz_data = dz[row_id];
+      auto z_data = z[row_id];
+      auto* x_data = x + cols * row_id;
+      auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+
+      auto y_norm_square = y_norm[0] * y_norm[0];
+      auto reciprocal_y_norm_square = 1 / y_norm_square;
+      for (size_t i = 0; i < cols; ++i) {
+        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
+                            z_data * y[i] * reciprocal_y_norm_square);
+      }
+    }
+  }
+};
+
+template struct CosSimDyFunctor<platform::CPUDeviceContext, float>;
+template struct CosSimDyFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/operators/math/cos_sim_functor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6eb0a4ea4c5b86f84c93b97615255adf55e9e042
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
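The CPU CosSimDyFunctor above accumulates the gradient of the cosine score with respect to the broadcast input y, i.e. dy[i] += dz * (x[i] / (|x||y|) - z * y[i] / |y|^2), summed over rows. The same update on raw arrays in plain C++, as a standalone reference for checking the formula (a sketch, not code from the patch):

#include <cstddef>

void CosSimDyReference(const float* x_norm, float y_norm, const float* x,
                       const float* y, const float* z, const float* dz,
                       std::size_t rows, std::size_t cols, float* dy) {
  const float inv_y_norm_sq = 1.0f / (y_norm * y_norm);
  for (std::size_t r = 0; r < rows; ++r) {
    const float inv_xy = 1.0f / (x_norm[r] * y_norm);
    for (std::size_t i = 0; i < cols; ++i) {
      dy[i] += dz[r] * (x[r * cols + i] * inv_xy - z[r] * y[i] * inv_y_norm_sq);
    }
  }
}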
*/
+
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x,
+                               const T* y, const T* z, const T* dz,
+                               const size_t rows, const size_t cols, T* dy) {
+  int grid_size = blockDim.x * gridDim.x;
+  T y_norm_data = y_norm[0];
+  for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows;
+       row_id += grid_size) {
+    T xy_norm_prod = x_norm[row_id] * y_norm_data;
+    T dz_data = dz[row_id];
+    T z_data = z[row_id];
+    const T* x_data = x + cols * row_id;
+    T reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+
+    T y_norm_square = y_norm_data * y_norm_data;
+    T reciprocal_y_norm_square = 1 / y_norm_square;
+    for (size_t i = 0; i < cols; ++i) {
+      T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
+                             z_data * y[i] * reciprocal_y_norm_square);
+      platform::CudaAtomicAdd(dy + i, dy_data);
+    }
+  }
+}
+
+template <typename T>
+struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    const int block_size = 512;
+    dim3 threads(block_size, 1);
+    dim3 grid((rows + block_size - 1) / block_size, 1);
+    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+        x_norm, y_norm, x, y, z, dz, rows, cols, dy);
+  }
+};
+
+template struct CosSimDyFunctor<platform::CUDADeviceContext, float>;
+template struct CosSimDyFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cos_sim_functor.h b/paddle/operators/math/cos_sim_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..aae8ab5b7a937c016e8a45e34b22aba7a1df3066
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <stdlib.h>
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, bool same_row>
+struct CosSimFunctor {
+  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    auto* x = x_ + cols_ * row_id;
+    T xx = 0, xy = 0, yy = 0;
+    if (same_row) {
+      auto* y = y_ + cols_ * row_id;
+      T tep_x, tep_y;
+      for (size_t i = 0; i < cols_; ++i) {
+        tep_x = x[i];
+        tep_y = y[i];
+        xx += tep_x * tep_x;
+        yy += tep_y * tep_y;
+        xy += tep_x * tep_y;
+      }
+      xx = sqrt(xx);
+      yy = sqrt(yy);
+      y_norm_[row_id] = yy;
+      x_norm_[row_id] = xx;
+      z_[row_id] = xy / (xx * yy);
+    } else {  // This could be written in a better way.
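// Broadcast case: y_ holds a single row that every row of x_ is scored
// against, so yy is recomputed for each row and only row_id == 0 stores
// y_norm_[0]; all rows derive the same value, so the single write is safe
// even when rows are processed in parallel.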
+ T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; + auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm, + const T* x, const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc index cf238a58e0a0b930077b0376a71dc02c5b31efe5..d9cb016fb440b6b2fe1d222812215feb5970dc4f 100644 --- a/paddle/operators/math/cross_entropy.cc +++ b/paddle/operators/math/cross_entropy.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/math/cross_entropy.h" @@ -24,9 +24,9 @@ template ; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel) { const int batch_size = prob->dims()[0]; @@ -35,7 +35,7 @@ class CrossEntropyFunctor { auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.GetEigenDevice()) = + loss.device(*ctx.eigen_device()) = -((lbl * in.log().unaryExpr(math::TolerableValue())) .sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); @@ -53,8 +53,8 @@ class CrossEntropyFunctor { } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 651c08f740c2991b11c210c9bf012e505adc1835..16c9e7b28ec8d453492455c8d620ba9edf130a07 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
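Only the soft-label branch of the CPU functor is shown above; the elided hard-label branch amounts to loss[i] = -log(prob[i][label[i]]), clamped through math::TolerableValue. A plain-C++ sketch of that computation, without the clamping (illustrative only, not code from the patch):

#include <cmath>
#include <cstdint>

void CrossEntropyHardLabel(const float* prob, const std::int64_t* label,
                           int batch_size, int class_num, float* loss) {
  for (int i = 0; i < batch_size; ++i) {
    const int index = i * class_num + static_cast<int>(label[i]);
    loss[i] = -std::log(prob[index]);
  }
}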
*/ #include "paddle/operators/math/cross_entropy.h" @@ -95,10 +95,10 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, using Tensor = framework::Tensor; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, - const framework::Tensor* prob, + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, bool softLabel) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -118,16 +118,14 @@ class CrossEntropyFunctor { const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; - CrossEntropyKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( + CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num); } } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h index 70ed9ddd551bb8cb7989727c02fea870186c9f2e..b3b6d767a8b8f59e3c75e72ac6c98653a8e1c3a4 100644 --- a/paddle/operators/math/cross_entropy.h +++ b/paddle/operators/math/cross_entropy.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" @@ -33,11 +33,11 @@ struct TolerableValue { } }; -template +template class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& context, - framework::Tensor* out, const framework::Tensor* prob, + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel); }; } // namespace math diff --git a/paddle/operators/math/detail/activation_functions.h b/paddle/operators/math/detail/activation_functions.h index a20c35d1d9dc4a3a6fae92023fd1aae787a716ec..585a0123437a39c2b610306b18fe0a970c0ed072 100644 --- a/paddle/operators/math/detail/activation_functions.h +++ b/paddle/operators/math/detail/activation_functions.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/platform/enforce.h" #include "paddle/platform/hostdevice.h" #ifdef __AVX__ @@ -29,6 +30,26 @@ namespace detail { #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +inline ActivationType GetActivationType(const std::string &type) { + if (type == "sigmoid") { + return ActivationType::kSigmoid; + } else if (type == "relu") { + return ActivationType::kReLU; + } else if (type == "tanh") { + return ActivationType::kTanh; + } else if (type == "identity" || type == "") { + return ActivationType::kIdentity; + } + PADDLE_THROW("Not support type %s.", type); +} + namespace forward { template diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h index 51af140cf4d5e6581765bea00033fa53d383230d..a61b232f4275d93cae1d9a71d49a779216c3555b 100644 --- a/paddle/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/operators/math/detail/gru_cpu_kernel.h @@ -25,393 +25,395 @@ namespace detail { #ifndef __NVCC__ template -void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, - T *gateValue, T *resetOutputValue, - T *prevOutputValue, int frameSize, - activation_mode_t active_gate) { - T rValueUpdateGate; - T rValueResetGate; - T rValueResetOutput; - T rPrevOut = 0; - T *updateGate = gateValue; - T *resetGate = gateValue + frameSize; - - for (int i = 0; i < frameSize; i++) { - rValueUpdateGate = updateGate[i]; - rValueResetGate = resetGate[i]; - if (prevOutputValue) { - rPrevOut = prevOutputValue[i]; +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, - rValueResetOutput, active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - updateGate[i] = rValueUpdateGate; - resetGate[i] = rValueResetGate; - resetOutputValue[i] = rValueResetOutput; + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; } } template -void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput, - T *gateValue, T *prevOutputValue, - T *outputValue, int frameSize, - activation_mode_t active_node) { - T rValueUpdateGate; - T rValueFrameState; - T rPrevOut = 0; - T rOutput; - T *updateGate = gateValue; - T *frameState = gateValue + frameSize * 2; - - for (int i = 0; i < frameSize; i++) { - rValueUpdateGate = updateGate[i]; - rValueFrameState = frameState[i]; - if (prevOutputValue) { - rPrevOut = prevOutputValue[i]; +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if 
(prev_output_value) { + r_prev_out = prev_output_value[i]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - frameState[i] = rValueFrameState; - outputValue[i] = rOutput; + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; } } template -void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue, - T *resetOutputValue, T *prevOutputValue, - int frameSize, - activation_mode_t active_gate) { +void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { #ifdef __AVX__ - __m256 rValueUpdateGate; - __m256 rValueResetGate; - __m256 rValueResetOutput; - __m256 rPrevOut = _mm256_set1_ps(0.0f); - __m256 *updateGate = (__m256 *)gateValue; - __m256 *resetGate = (__m256 *)(gateValue + frameSize); - - for (int i = 0; i < frameSize / 8; i++) { - rValueUpdateGate = updateGate[i]; - rValueResetGate = resetGate[i]; - if (prevOutputValue) { - rPrevOut = ((__m256 *)prevOutputValue)[i]; + __m256 r_value_update_gate; + __m256 r_value_reset_gate; + __m256 r_value_reset_output; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 *update_gate = (__m256 *)gate_value; + __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, - rValueResetOutput, active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - updateGate[i] = rValueUpdateGate; - resetGate[i] = rValueResetGate; - ((__m256 *)resetOutputValue)[i] = rValueResetOutput; + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + ((__m256 *)reset_output_value)[i] = r_value_reset_output; } #endif } template -void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue, - T *prevOutputValue, T *outputValue, - int frameSize, - activation_mode_t active_node) { +void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { #ifdef __AVX__ - __m256 rValueUpdateGate; - __m256 rValueFrameState; - __m256 rPrevOut = _mm256_set1_ps(0.0f); - __m256 rOutput; - __m256 *updateGate = (__m256 *)gateValue; - __m256 *frameState = (__m256 *)(gateValue + frameSize * 2); - - for (int i = 0; i < frameSize / 8; i++) { - rValueUpdateGate = updateGate[i]; - rValueFrameState = frameState[i]; - if (prevOutputValue) { - rPrevOut = ((__m256 *)prevOutputValue)[i]; + __m256 r_value_update_gate; + __m256 r_value_frame_state; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_output; + __m256 *update_gate = (__m256 *)gate_value; + __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - frameState[i] = rValueFrameState; - ((__m256 
*)outputValue)[i] = rOutput; + frame_state[i] = r_value_frame_state; + ((__m256 *)output_value)[i] = r_output; } #endif } template -inline void forward_reset_output(OpResetOutput opResetOutput, - hl_gru_value value, int frameSize, - int batchSize, activation_mode_t active_gate) { - for (int b = 0; b < batchSize; b++) { - if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void forward_reset_output(OpResetOutput op_reset_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_forward_reset_output( - opResetOutput, value.gateValue, value.resetOutputValue, - value.prevOutValue, frameSize, active_gate); + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); } else { hl_naive_gru_forward_reset_output( - opResetOutput, value.gateValue, value.resetOutputValue, - value.prevOutValue, frameSize, active_gate); + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); } - value.gateValue += frameSize * 3; - value.resetOutputValue += frameSize; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } } } template -inline void forward_final_output(OpFinalOutput opFinalOutput, - hl_gru_value value, int frameSize, - int batchSize, activation_mode_t active_node) { - for (int b = 0; b < batchSize; b++) { - if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue, - value.prevOutValue, value.outputValue, - frameSize, active_node); +inline void forward_final_output(OpFinalOutput op_final_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); } else { - hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue, - value.prevOutValue, value.outputValue, - frameSize, active_node); + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node); } - value.gateValue += frameSize * 3; - value.outputValue += frameSize; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } } } template -void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, - activation_mode_t active_node) { - T rUpdateGateValue; - T rUpdateGateGrad; - T rFrameStateValue; - T rFrameStateGrad; - T rOutGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T *updateGateValue = gateValue; - T *updateGateGrad = gateGrad; - T *frameStateValue = gateValue + frameSize * 2; - T *frameStateGrad = gateGrad + frameSize * 2; - - for (int i = 0; i < frameSize; i++) { - rUpdateGateValue = updateGateValue[i]; - rFrameStateValue = frameStateValue[i]; - rOutGrad = outputGrad[i]; - if (prevOutValue) { - 
rPrevOutValue = prevOutValue[i]; +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; } - if (prevOutGrad) { - rPrevOutGrad = prevOutGrad[i]; + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - updateGateGrad[i] = rUpdateGateGrad; - frameStateGrad[i] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; } } } template -void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, - activation_mode_t active_gate) { - T rUpdateGateValue; - T rUpdateGateGrad; - T rResetGateValue; - T rResetGateGrad; - T rResetOutputGrad = 0; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T *updateGateValue = gateValue; - T *updateGateGrad = gateGrad; - T *resetGateValue = gateValue + frameSize; - T *resetGateGrad = gateGrad + frameSize; - - for (int i = 0; i < frameSize; i++) { - rUpdateGateValue = updateGateValue[i]; - rUpdateGateGrad = updateGateGrad[i]; - rResetGateValue = resetGateValue[i]; - - if (prevOutValue && prevOutGrad) { - rResetOutputGrad = resetOutputGrad[i]; +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; } - if (prevOutValue) { - rPrevOutValue = prevOutValue[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; } - if (prevOutGrad) { - rPrevOutGrad = prevOutGrad[i]; + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, 
r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - updateGateGrad[i] = rUpdateGateGrad; - resetGateGrad[i] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; } } } template -void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, - activation_mode_t active_node) { +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { #ifdef __AVX__ - __m256 rUpdateGateValue; - __m256 rUpdateGateGrad; - __m256 rFrameStateValue; - __m256 rFrameStateGrad; - __m256 rOutGrad; - __m256 rPrevOutValue = _mm256_set1_ps(0.0f); - __m256 rPrevOutGrad = _mm256_set1_ps(0.0f); - __m256 *updateGateValue = (__m256 *)gateValue; - __m256 *updateGateGrad = (__m256 *)gateGrad; - __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2); - __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2); - - for (int i = 0; i < frameSize / 8; i++) { - rUpdateGateValue = updateGateValue[i]; - rFrameStateValue = frameStateValue[i]; - rOutGrad = ((__m256 *)outputGrad)[i]; - if (prevOutValue) { - rPrevOutValue = ((__m256 *)prevOutValue)[i]; + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); + __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = ((__m256 *)output_grad)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; } - if (prevOutGrad) { - rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - updateGateGrad[i] = rUpdateGateGrad; - frameStateGrad[i] = rFrameStateGrad; - if (prevOutGrad) { - ((__m256 *)prevOutGrad)[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; } } #endif } template -void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, - activation_mode_t active_gate) { +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { #ifdef __AVX__ - __m256 rUpdateGateValue; - __m256 rUpdateGateGrad; - __m256 rResetGateValue; - __m256 rResetGateGrad; - __m256 
rResetOutputGrad = _mm256_set1_ps(0.0f); - __m256 rPrevOutValue = _mm256_set1_ps(0.0f); - __m256 rPrevOutGrad = _mm256_set1_ps(0.0f); - __m256 *updateGateValue = (__m256 *)gateValue; - __m256 *updateGateGrad = (__m256 *)gateGrad; - __m256 *resetGateValue = (__m256 *)(gateValue + frameSize); - __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize); - - for (int i = 0; i < frameSize / 8; i++) { - rUpdateGateValue = updateGateValue[i]; - rUpdateGateGrad = updateGateGrad[i]; - rResetGateValue = resetGateValue[i]; - - if (prevOutValue && prevOutGrad) { - rResetOutputGrad = ((__m256 *)resetOutputGrad)[i]; + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); + __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; } - if (prevOutValue) { - rPrevOutValue = ((__m256 *)prevOutValue)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; } - if (prevOutGrad) { - rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - updateGateGrad[i] = rUpdateGateGrad; - resetGateGrad[i] = rResetGateGrad; - if (prevOutGrad) { - ((__m256 *)prevOutGrad)[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; } } #endif } template -inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value value, - hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node) { - for (int b = 0; b < batchSize; b++) { - if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void backward_state_grad(OpStateGrad op_state_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_state_grad( - opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.outputGrad, frameSize, active_node); + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); } else { hl_naive_gru_backward_state_grad( - opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.outputGrad, frameSize, active_node); + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); } - value.gateValue += frameSize * 3; - if (value.prevOutValue) { - 
value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } - grad.gateGrad += frameSize * 3; - grad.outputGrad += frameSize; - if (grad.prevOutGrad) { - grad.prevOutGrad += frameSize; + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; } } } template -inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value value, - hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_gate) { - for (int b = 0; b < batchSize; b++) { - if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void backward_reset_grad(OpResetGrad op_reset_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_reset_grad( - opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); } else { hl_naive_gru_backward_reset_grad( - opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); } - value.gateValue += frameSize * 3; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } - grad.gateGrad += frameSize * 3; - grad.resetOutputGrad += frameSize; - if (grad.prevOutGrad) { - grad.prevOutGrad += frameSize; + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; } } } diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h index 6441c648b048422c110872a85aa8cb719f11a8d7..1783d46096858c874b27ce75760342082835b180 100644 --- a/paddle/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/operators/math/detail/gru_gpu_kernel.h @@ -19,182 +19,180 @@ limitations under the License. 
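Every dispatcher in this file gates the AVX fast path on the same condition; spelled out as a standalone predicate (a sketch mirroring the checks above), it requires the op to provide an AVX overload, the frame width to fill whole __m256 registers, and T to be a 32-bit float:

template <typename Op, typename T>
inline bool CanUseAvxPath(int frame_size) {
  // 8 floats per __m256 register; (frame_size & 7) == 0 means frame_size is a
  // multiple of 8, so no scalar remainder loop is needed.
  return Op::avx && !(frame_size & (8 - 1)) && sizeof(T) == 4;
}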
*/ #include "paddle/platform/cuda_helper.h" #include "paddle/platform/device_context.h" -#include - namespace paddle { namespace operators { namespace math { namespace detail { /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, - T *gateValue, T *resetOutputValue, - T *prevOutputValue, int frameSize, - int batchSize, - activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - resetOutputValue += batchIdx * frameSize; +template +__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; } - T rPrevOut = 0; - T rValueResetOutput; - T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - T rValueResetGate = gateValue[frameIdx + frameSize * 1]; + T r_prev_out = 0; + T r_value_reset_output; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_reset_gate = gate_value[frame_idx + frame_size * 1]; - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput, - active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; - gateValue[frameIdx + frameSize * 1] = rValueResetGate; - resetOutputValue[frameIdx] = rValueResetOutput; + gate_value[frame_idx + frame_size * 0] = r_value_update_gate; + gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; + reset_output_value[frame_idx] = r_value_reset_output; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, - T *gateValue, T *prevOutputValue, - T *outputValue, int frameSize, - int batchSize, - activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - outputValue += batchIdx * frameSize; +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + 
if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; } - T rOutput; - T rPrevOut = 0; - T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - T rValueFrameState = gateValue[frameIdx + frameSize * 2]; + T r_output; + T r_prev_out = 0; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_frame_state = gate_value[frame_idx + frame_size * 2]; - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - gateValue[frameIdx + frameSize * 2] = rValueFrameState; - outputValue[frameIdx] = rOutput; + gate_value[frame_idx + frame_size * 2] = r_value_frame_state; + output_value[frame_idx] = r_output; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, int batchSize, - activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - outputGrad += batchIdx * frameSize; +template +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + output_grad += batch_idx * frame_size; } - T rUpdateGateGrad; - T rFrameStateGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - T rFrameStateValue = gateValue[frameIdx + frameSize * 2]; - T rOutGrad = outputGrad[frameIdx]; + T r_update_gate_grad; + T r_frame_state_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_frame_state_value = gate_value[frame_idx + frame_size * 2]; + T r_out_grad = output_grad[frame_idx]; - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutGrad = prevOutGrad[frameIdx]; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_grad = prev_out_grad[frame_idx]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, 
rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; } } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, int batchSize, - activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - resetOutputGrad += batchIdx * frameSize; +template +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + reset_output_grad += batch_idx * frame_size; } - T rResetGateGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T rResetOutputGrad = 0; - T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; - T rResetGateValue = gateValue[frameIdx + frameSize * 1]; - - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; - rPrevOutGrad = prevOutGrad[frameIdx]; - rResetOutputGrad = resetOutputGrad[frameIdx]; + T r_reset_gate_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_reset_output_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0]; + T r_reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + r_prev_out_grad = prev_out_grad[frame_idx]; + r_reset_output_grad = reset_output_grad[frame_idx]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] 
= rPrevOutGrad; + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; } } } // namespace detail diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h index 8a681d8d8bced72e1296f863489f6ccbc7913167..4d8245cb5d03b33edbda5d8350be02b4fa87ab95 100644 --- a/paddle/operators/math/detail/gru_kernel.h +++ b/paddle/operators/math/detail/gru_kernel.h @@ -28,23 +28,25 @@ namespace forward { template class gru_resetOutput { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut, - T &valueResetOutput, activation_mode_t actGate) { - valueUpdateGate = activation(valueUpdateGate, actGate); - valueResetGate = activation(valueResetGate, actGate); - valueResetOutput = prevOut * valueResetGate; + HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, + T &prev_out, T &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = prev_out * value_reset_gate; } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate, - __m256 &prevOut, __m256 &valueResetOutput, - activation_mode_t actGate) { - valueUpdateGate = activation(valueUpdateGate, actGate); - valueResetGate = activation(valueResetGate, actGate); - valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_reset_gate, __m256 &prev_out, + __m256 &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); } #endif #endif @@ -53,24 +55,25 @@ class gru_resetOutput { template class gru_finalOutput { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut, - T &valueOutput, activation_mode_t actInput) { - valueFrameState = activation(valueFrameState, actInput); - valueOutput = prevOut - (valueUpdateGate * prevOut) + - (valueUpdateGate * valueFrameState); + HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, + T &prev_out, T &value_output, + ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = prev_out - (value_update_gate * prev_out) + + (value_update_gate * value_frame_state); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState, - __m256 &prevOut, __m256 &valueOutput, - activation_mode_t actInput) { - valueFrameState = activation(valueFrameState, actInput); - valueOutput = _mm256_add_ps( - _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)), - _mm256_mul_ps(valueUpdateGate, valueFrameState)); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_frame_state, __m256 &prev_out, + __m256 &value_output, ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = _mm256_add_ps( + _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), + _mm256_mul_ps(value_update_gate, value_frame_state)); } #endif #endif @@ -82,34 
+85,37 @@ namespace backward { template class gru_stateGrad { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, - T &valueFrameState, T &gradFrameState, - T &valuePrevOut, T &gradPrevOut, T &gradOutput, - activation_mode_t actInput) { - gradUpdateGate = (gradOutput * valueFrameState); - gradUpdateGate -= (gradOutput * valuePrevOut); - gradPrevOut -= (gradOutput * valueUpdateGate); - gradPrevOut += gradOutput; - gradFrameState = - activation(gradOutput * valueUpdateGate, valueFrameState, actInput); + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_frame_state, T &grad_frame_state, + T &value_prev_out, T &grad_prev_out, + T &grad_output, ActivationType act_input) { + grad_update_gate = (grad_output * value_frame_state); + grad_update_gate -= (grad_output * value_prev_out); + grad_prev_out -= (grad_output * value_update_gate); + grad_prev_out += grad_output; + grad_frame_state = activation(grad_output * value_update_gate, + value_frame_state, act_input); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, - __m256 &valueFrameState, __m256 &gradFrameState, - __m256 &valuePrevOut, __m256 &gradPrevOut, - __m256 &gradOutput, activation_mode_t actInput) { - gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState); - gradUpdateGate = - _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut)); - gradPrevOut = _mm256_add_ps( - _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)), - gradOutput); - gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate), - valueFrameState, actInput); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, + __m256 &value_frame_state, + __m256 &grad_frame_state, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_output, + ActivationType act_input) { + grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); + grad_update_gate = _mm256_sub_ps( + grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); + grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(grad_prev_out, + _mm256_mul_ps(grad_output, value_update_gate)), + grad_output); + grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), + value_frame_state, act_input); } #endif #endif @@ -118,30 +124,32 @@ class gru_stateGrad { template class gru_resetGrad { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, - T &valueResetGate, T &gradResetGate, - T &valuePrevOut, T &gradPrevOut, - T &gradResetOutput, activation_mode_t actGate) { - gradResetGate = (gradResetOutput * valuePrevOut); - gradPrevOut += (gradResetOutput * valueResetGate); - gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); - gradResetGate = activation(gradResetGate, valueResetGate, actGate); + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_reset_gate, T &grad_reset_gate, + T &value_prev_out, T &grad_prev_out, + T &grad_reset_output, ActivationType act_gate) { + grad_reset_gate = (grad_reset_output * value_prev_out); + grad_prev_out += (grad_reset_output * value_reset_gate); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 
&valueUpdateGate, __m256 &gradUpdateGate, - __m256 &valueResetGate, __m256 &gradResetGate, - __m256 &valuePrevOut, __m256 &gradPrevOut, - __m256 &gradResetOutput, - activation_mode_t actGate) { - gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut); - gradPrevOut = _mm256_add_ps(gradPrevOut, - _mm256_mul_ps(gradResetOutput, valueResetGate)); - gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); - gradResetGate = activation(gradResetGate, valueResetGate, actGate); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, __m256 &value_reset_gate, + __m256 &grad_reset_gate, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_reset_output, + ActivationType act_gate) { + grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); + grad_prev_out = _mm256_add_ps( + grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } #endif #endif diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h index fc3ad0ce58aa1552ef7e717fb529c2d454b4895a..42888fcdb0a464892e3007ee73c195fcd2a431bb 100644 --- a/paddle/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/operators/math/detail/lstm_cpu_kernel.h @@ -26,278 +26,281 @@ namespace detail { template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frameSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - T rCheckI; - T rCheckF; - T rCheckO; - T rState; - T rPrevState = 0; - T rStateAtv; - T rOut; - - T *valueIn = value.gateValue; - T *valueIg = value.gateValue + frameSize; - T *valueFg = value.gateValue + frameSize * 2; - T *valueOg = value.gateValue + frameSize * 3; - - for (int i = 0; i < frameSize; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - rCheckI = value.checkIg ? value.checkIg[i] : 0; - rCheckF = value.checkFg ? value.checkFg[i] : 0; - rCheckO = value.checkOg ? value.checkOg[i] : 0; - - if (value.prevStateValue) { - rPrevState = value.prevStateValue[i]; + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_checkI; + T r_checkF; + T r_checkO; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); - - valueIn[i] = rValueIn; - valueIg[i] = rValueIg; - valueFg[i] = rValueFg; - valueOg[i] = rValueOg; - value.stateValue[i] = rState; - value.stateActiveValue[i] = rStateAtv; - value.outputValue[i] = rOut; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + value.state_value[i] = r_state; + value.state_active_value[i] = r_state_atv; + value.output_value[i] = r_out; } } template void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - T rGradIn; - T rGradIg; - T rGradFg; - T rGradOg; - T rPrevState = 0; - T rPrevStateGrad; - T rState; - T rStateGrad; - T rStateAtv; - T rOutputGrad; - T rCheckI; - T rCheckF; - T rCheckO; - T rCheckIGrad; - T rCheckFGrad; - T rCheckOGrad; - - T *valueIn = value.gateValue; - T *valueIg = value.gateValue + frameSize; - T *valueFg = value.gateValue + frameSize * 2; - T *valueOg = value.gateValue + frameSize * 3; - T *gradIn = grad.gateGrad; - T *gradIg = grad.gateGrad + frameSize; - T *gradFg = grad.gateGrad + frameSize * 2; - T *gradOg = grad.gateGrad + frameSize * 3; - - for (int i = 0; i < frameSize; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - rCheckI = value.checkIg ? value.checkIg[i] : 0; - rCheckF = value.checkFg ? value.checkFg[i] : 0; - rCheckO = value.checkOg ? value.checkOg[i] : 0; - rState = value.stateValue[i]; - rStateAtv = value.stateActiveValue[i]; - rOutputGrad = grad.outputGrad[i]; - rStateGrad = grad.stateGrad[i]; - if (value.prevStateValue) { - rPrevState = value.prevStateValue[i]; + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI; + T r_checkF; + T r_checkO; + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + T *grad_in = grad.gate_grad; + T *grad_ig = grad.gate_grad + frame_size; + T *grad_fg = grad.gate_grad + frame_size * 2; + T *grad_og = grad.gate_grad + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
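// Note (not in the original patch): the backward loop below accumulates the
// peephole-weight gradients with += because one set of per-frame peephole
// weights is shared by every time step; d(check_ig)/d(check_fg) only exist
// when a previous cell state was fed in (hence the prev_state_value guard),
// while d(check_og) depends on the current state and is always accumulated:
//   check_ig_grad[i] += d_ig * c_prev;  // only with a previous state
//   check_fg_grad[i] += d_fg * c_prev;  // only with a previous state
//   check_og_grad[i] += d_og * c_t;     // always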
value.check_og[i] : 0; + r_state = value.state_value[i]; + r_state_atv = value.state_active_value[i]; + r_output_grad = grad.output_grad[i]; + r_state_grad = grad.state_grad[i]; + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, - rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, - rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, - rCheckOGrad, active_node, active_gate, active_state); - - gradIn[i] = rGradIn; - gradIg[i] = rGradIg; - gradFg[i] = rGradFg; - gradOg[i] = rGradOg; - grad.stateGrad[i] = rStateGrad; - - if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad; - if (value.prevStateValue) { - if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad; - if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + grad.state_grad[i] = r_state_grad; + + if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad; } - if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad; + if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad; } } template -void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, int frameSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { +void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { #ifdef __AVX__ - __m256 rValueIn; - __m256 rValueIg; - __m256 rValueFg; - __m256 rValueOg; - __m256 rCheckI = _mm256_set1_ps(0.0f); - __m256 rCheckF = _mm256_set1_ps(0.0f); - __m256 rCheckO = _mm256_set1_ps(0.0f); - __m256 rState; - __m256 rPrevState = _mm256_set1_ps(0.0f); - __m256 rStateAtv; - __m256 rOut; - - __m256 *valueIn = (__m256 *)value.gateValue; - __m256 *valueIg = (__m256 *)(value.gateValue + frameSize); - __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2); - __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3); - - for (int i = 0; i < frameSize / 8; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - if (value.checkIg) { - rCheckI = ((__m256 *)value.checkIg)[i]; - rCheckF = ((__m256 *)value.checkFg)[i]; - rCheckO = ((__m256 *)value.checkOg)[i]; + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_state; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_state_atv; + __m256 r_out; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; 
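// Note (not in the original patch): the AVX path reinterprets the float
// buffers as __m256 and processes 8 frames per iteration; the dispatcher
// only selects it when frame_size is a multiple of 8 and T is float, and
// dereferencing the casted pointer is an aligned load, so the buffers are
// presumably 32-byte aligned. A minimal self-contained sketch of the same
// cast-and-stride pattern:
#include <immintrin.h>
void scale_by_s(float *x, int n /* multiple of 8 */, float s) {
  __m256 vs = _mm256_set1_ps(s);
  __m256 *vx = (__m256 *)x;  // same cast as value_in/value_ig above
  for (int i = 0; i < n / 8; i++) vx[i] = _mm256_mul_ps(vx[i], vs);
}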
+ r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; } - if (value.prevStateValue) { - rPrevState = ((__m256 *)value.prevStateValue)[i]; + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); - - valueIn[i] = rValueIn; - valueIg[i] = rValueIg; - valueFg[i] = rValueFg; - valueOg[i] = rValueOg; - ((__m256 *)value.stateValue)[i] = rState; - ((__m256 *)value.stateActiveValue)[i] = rStateAtv; - ((__m256 *)value.outputValue)[i] = rOut; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + ((__m256 *)value.state_value)[i] = r_state; + ((__m256 *)value.state_active_value)[i] = r_state_atv; + ((__m256 *)value.output_value)[i] = r_out; } #endif } template void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { #ifdef __AVX__ - __m256 rValueIn; - __m256 rValueIg; - __m256 rValueFg; - __m256 rValueOg; - __m256 rGradIn; - __m256 rGradIg; - __m256 rGradFg; - __m256 rGradOg; - __m256 rPrevState = _mm256_set1_ps(0.0f); - __m256 rPrevStateGrad; - __m256 rStateGrad; - __m256 rState; - __m256 rStateAtv; - __m256 rOutputGrad; - __m256 rCheckI = _mm256_set1_ps(0.0f); - __m256 rCheckF = _mm256_set1_ps(0.0f); - __m256 rCheckO = _mm256_set1_ps(0.0f); - __m256 rCheckIGrad; - __m256 rCheckFGrad; - __m256 rCheckOGrad; - - __m256 *valueIn = (__m256 *)value.gateValue; - __m256 *valueIg = (__m256 *)(value.gateValue + frameSize); - __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2); - __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3); - __m256 *gradIn = (__m256 *)grad.gateGrad; - __m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize); - __m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2); - __m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3); - - for (int i = 0; i < frameSize / 8; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - if (value.checkIg) { - rCheckI = ((__m256 *)value.checkIg)[i]; - rCheckF = ((__m256 *)value.checkFg)[i]; - rCheckO = ((__m256 *)value.checkOg)[i]; + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_grad_in; + __m256 r_grad_ig; + __m256 r_grad_fg; + __m256 r_grad_og; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_prev_state_grad; + __m256 r_state_grad; + __m256 r_state; + __m256 r_state_atv; + __m256 r_output_grad; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_checkIGrad; + __m256 r_checkFGrad; + __m256 r_checkOGrad; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = 
(__m256 *)(value.gate_value + frame_size * 3); + __m256 *grad_in = (__m256 *)grad.gate_grad; + __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); + __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); + __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; } - rState = ((__m256 *)value.stateValue)[i]; - rStateAtv = ((__m256 *)value.stateActiveValue)[i]; - rOutputGrad = ((__m256 *)grad.outputGrad)[i]; - rStateGrad = ((__m256 *)grad.stateGrad)[i]; - if (value.prevStateValue) { - rPrevState = ((__m256 *)value.prevStateValue)[i]; + r_state = ((__m256 *)value.state_value)[i]; + r_state_atv = ((__m256 *)value.state_active_value)[i]; + r_output_grad = ((__m256 *)grad.output_grad)[i]; + r_state_grad = ((__m256 *)grad.state_grad)[i]; + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, - rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, - rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, - rCheckOGrad, active_node, active_gate, active_state); - - gradIn[i] = rGradIn; - gradIg[i] = rGradIg; - gradFg[i] = rGradFg; - gradOg[i] = rGradOg; - ((__m256 *)grad.stateGrad)[i] = rStateGrad; - - if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad; - if (value.prevStateValue) { - if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad; - if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + ((__m256 *)grad.state_grad)[i] = r_state_grad; + + if (grad.prev_state_grad) + ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; + if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; } - if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad; + if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; } #endif } template -void cpu_lstm_forward(Op op, LstmMetaValue value, int frameSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same::value)) { - avx_lstm_forward_one_sequence(op, value, frameSize, active_node, +void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_forward_one_sequence(op, value, frame_size, active_node, active_gate, active_state); } else { - naive_lstm_forward_one_sequence(op, value, frameSize, active_node, + naive_lstm_forward_one_sequence(op, value, frame_size, active_node, active_gate, active_state); } } template void 
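// Note (not in the original patch): the cpu_lstm_forward/cpu_lstm_backward
// dispatch relies on Op::avx (the per-functor capability flag) plus a
// power-of-two remainder test; the stripped template argument in the
// std::is_same check is presumably <T, float>. The condition reduces to:
bool use_avx_path(bool op_supports_avx, int frame_size, bool t_is_float) {
  // frame_size & (8 - 1) equals frame_size % 8 for the power-of-two divisor 8
  return op_supports_avx && (frame_size % 8 == 0) && t_is_float;
}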
cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, - int frameSize, activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same::value)) { - avx_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, active_gate, active_state); } else { - naive_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, - active_gate, active_state); + naive_lstm_backward_one_sequence(op, value, grad, frame_size, + active_node, active_gate, active_state); } } diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h index d138bbe411f69929a14ad19af3e84824ac7a5d58..e31e657e8b6964c2b99f6e456545c83d8da8e7f9 100644 --- a/paddle/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/operators/math/detail/lstm_gpu_kernel.h @@ -26,189 +26,191 @@ namespace math { namespace detail { /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeLstmForward(Op op, LstmMetaValue value, int frameSize, - int batchSize, activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - value.gateValue += batchIdx * frameSize * 4; - value.outputValue += batchIdx * frameSize; - value.stateValue += batchIdx * frameSize; - value.stateActiveValue += batchIdx * frameSize; +template +__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.output_value += batch_idx * frame_size; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; } - T rState; - T rPrevState = 0; - T rStateAtv; - T rOut; - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - - T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0; - T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0; - T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0; - - rValueIn = value.gateValue[frameIdx]; - rValueIg = value.gateValue[frameIdx + frameSize]; - rValueFg = value.gateValue[frameIdx + frameSize * 2]; - rValueOg = value.gateValue[frameIdx + frameSize * 3]; - - if (value.prevStateValue) { - if (isBatch) value.prevStateValue += batchIdx * frameSize; - rPrevState = value.prevStateValue[frameIdx]; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
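// Note (not in the original patch): in KeLstmForward each CUDA thread owns
// one frame element, and in batch mode a second launch dimension selects the
// batch row, whose buffers are located by pointer offset:
//   frame_idx = blockIdx.x * blockDim.x + threadIdx.x;  // in [0, frame_size)
//   batch_idx = blockIdx.y * blockDim.y + threadIdx.y;  // in [0, batch_size)
//   value.gate_value += batch_idx * frame_size * 4;     // 4 packed gates/row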
value.check_og[frame_idx] : 0; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, + active_state); - value.gateValue[frameIdx] = rValueIn; - value.gateValue[frameIdx + frameSize] = rValueIg; - value.gateValue[frameIdx + frameSize * 2] = rValueFg; - value.gateValue[frameIdx + frameSize * 3] = rValueOg; + value.gate_value[frame_idx] = r_value_in; + value.gate_value[frame_idx + frame_size] = r_value_ig; + value.gate_value[frame_idx + frame_size * 2] = r_value_fg; + value.gate_value[frame_idx + frame_size * 3] = r_value_og; - value.stateValue[frameIdx] = rState; - value.stateActiveValue[frameIdx] = rStateAtv; - value.outputValue[frameIdx] = rOut; + value.state_value[frame_idx] = r_state; + value.state_active_value[frame_idx] = r_state_atv; + value.output_value[frame_idx] = r_out; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template +template __global__ void KeLstmBackward(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize, - int batchSize, activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - value.gateValue += batchIdx * frameSize * 4; - value.stateValue += batchIdx * frameSize; - value.stateActiveValue += batchIdx * frameSize; - grad.gateGrad += batchIdx * frameSize * 4; - grad.stateGrad += batchIdx * frameSize; - grad.outputGrad += batchIdx * frameSize; + LstmMetaGrad grad, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + grad.gate_grad += batch_idx * frame_size * 4; + grad.state_grad += batch_idx * frame_size; + grad.output_grad += batch_idx * frame_size; } - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - T rGradIn; - T rGradIg; - T rGradFg; - T rGradOg; - T rPrevState = 0; - T rPrevStateGrad; - T rState; - T rStateGrad; - T rStateAtv; - T rOutputGrad; - T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0; - T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0; - T rCheckO = value.checkOg ? 
value.checkOg[frameIdx] : 0; - - T rCheckIGrad; - T rCheckFGrad; - T rCheckOGrad; - - rValueIn = value.gateValue[frameIdx]; - rValueIg = value.gateValue[frameIdx + frameSize]; - rValueFg = value.gateValue[frameIdx + frameSize * 2]; - rValueOg = value.gateValue[frameIdx + frameSize * 3]; - rState = value.stateValue[frameIdx]; - rStateAtv = value.stateActiveValue[frameIdx]; - rOutputGrad = grad.outputGrad[frameIdx]; - rStateGrad = grad.stateGrad[frameIdx]; - - if (value.prevStateValue) { - if (isBatch) value.prevStateValue += batchIdx * frameSize; - rPrevState = value.prevStateValue[frameIdx]; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? value.check_og[frame_idx] : 0; + + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + r_state = value.state_value[frame_idx]; + r_state_atv = value.state_active_value[frame_idx]; + r_output_grad = grad.output_grad[frame_idx]; + r_state_grad = grad.state_grad[frame_idx]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, - rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, - rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad, - active_node, active_gate, active_state); - - grad.gateGrad[frameIdx] = rGradIn; - grad.gateGrad[frameIdx + frameSize] = rGradIg; - grad.gateGrad[frameIdx + frameSize * 2] = rGradFg; - grad.gateGrad[frameIdx + frameSize * 3] = rGradOg; - grad.stateGrad[frameIdx] = rStateGrad; - if (grad.prevStateGrad) { - if (isBatch) grad.prevStateGrad += batchIdx * frameSize; - grad.prevStateGrad[frameIdx] = rPrevStateGrad; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad.gate_grad[frame_idx] = r_grad_in; + grad.gate_grad[frame_idx + frame_size] = r_grad_ig; + grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg; + grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og; + grad.state_grad[frame_idx] = r_state_grad; + if (grad.prev_state_grad) { + if (is_batch) grad.prev_state_grad += batch_idx * frame_size; + grad.prev_state_grad[frame_idx] = r_prev_state_grad; } - if (isBatch) { - if (value.prevStateValue) { - if (grad.checkIgGrad) - paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx, - rCheckIGrad); - if (grad.checkFgGrad) - paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx, - rCheckFGrad); + if (is_batch) { + if (value.prev_state_value) { + if (grad.check_ig_grad) + paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, + r_checkIGrad); + if (grad.check_fg_grad) + paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, + r_checkFGrad); } - if (grad.checkOgGrad) - 
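// Note (not in the original patch): in the batched backward kernel every
// batch row folds its peephole gradient into the same per-frame slot, so the
// accumulation must be atomic; the single-sequence branch below keeps a plain
// +=. Schematically:
//   grad.check_og_grad[frame_idx] += r_checkOGrad;                // races
//   CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad);  // safe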
paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad); + if (grad.check_og_grad) + paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, + r_checkOGrad); } else { - if (value.prevStateValue) { - if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; - if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad; } - if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad; + if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad; } } template void gpu_lstm_forward(const platform::DeviceContext& context, Op op, - LstmMetaValue value, int frameSize, int batchSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + LstmMetaValue value, int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { - /* framePerBlock = 32 batchPerBlock = 32 */ + /* frame_per_block = 32 batch_per_block = 32 */ threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } auto stream = reinterpret_cast(context).stream(); - if (batchSize == 1) { + if (batch_size == 1) { KeLstmForward<<>>( - op, value, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ false><<>>( + op, value, frame_size, batch_size, active_node, active_gate, active_state); } else { KeLstmForward<<>>( - op, value, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ true><<>>( + op, value, frame_size, batch_size, active_node, active_gate, active_state); } } @@ -216,34 +218,33 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, template void gpu_lstm_backward(const platform::DeviceContext& context, Op op, LstmMetaValue value, LstmMetaGrad grad, - int frameSize, int batchSize, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
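// Note (not in the original patch): both LSTM launchers use the same shape
// rule: a single step gets a 1-D launch capped at 1024 threads per block, a
// batch gets a 2-D (frame x batch) tiling (32x32 forward, 32x16 backward).
// An equivalent helper under those assumptions:
void lstm_launch_shape(int frame_size, int batch_size, int batch_tile,
                       dim3 *threads, dim3 *grid) {
  if (batch_size == 1) {
    *threads = dim3(frame_size <= 1024 ? frame_size : 1024, 1);
    *grid = dim3((frame_size + 1024 - 1) / 1024, 1);
  } else {
    *threads = dim3(32, batch_tile);  // batch_tile = 32 or 16
    *grid = dim3((frame_size + 32 - 1) / 32,
                 (batch_size + batch_tile - 1) / batch_tile);
  }
}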
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { - /* framePerBlock = 32 batchPerBlock = 16 */ + /* frame_per_block = 32 batch_per_block = 16 */ threads = dim3(32, 16); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 16 - 1) / 16); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); } auto stream = reinterpret_cast(context).stream(); - if (batchSize == 1) { + if (batch_size == 1) { KeLstmBackward<<>>( - op, value, grad, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ false><<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, active_state); } else { KeLstmBackward<<>>( - op, value, grad, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ true><<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, active_state); } } diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h index 9daaf91981a8e0252374f528f0e063111bd32675..fed8f9c4ca48905ad4c524ba400e8c7bb2f7fbd1 100644 --- a/paddle/operators/math/detail/lstm_kernel.h +++ b/paddle/operators/math/detail/lstm_kernel.h @@ -27,19 +27,19 @@ namespace forward { template class lstm { public: - HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, - T &prevState, T &state, T &stateAtv, T &output, + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &prev_state, T &state, T &state_atv, T &output, T &checkI, T &checkF, T &checkO, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - valueIn = activation(valueIn, active_node); - valueIg = activation(valueIg + prevState * checkI, active_gate); - valueFg = activation(valueFg + prevState * checkF, active_gate); - state = valueIn * valueIg + prevState * valueFg; - valueOg = activation(valueOg + state * checkO, active_gate); - stateAtv = activation(state, active_state); - output = valueOg * stateAtv; + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = activation(value_ig + prev_state * checkI, active_gate); + value_fg = activation(value_fg + prev_state * checkF, active_gate); + state = value_in * value_ig + prev_state * value_fg; + value_og = activation(value_og + state * checkO, active_gate); + state_atv = activation(state, active_state); + output = value_og * state_atv; } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default @@ -48,24 +48,27 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg, - __m256 &valueOg, __m256 &prevState, __m256 &state, - __m256 &stateAtv, __m256 &output, __m256 &checkI, + HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, + __m256 &value_fg, __m256 &value_og, + __m256 &prev_state, __m256 &state, + __m256 &state_atv, __m256 &output, __m256 &checkI, __m256 &checkF, __m256 &checkO, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - valueIn = activation(valueIn, active_node); - valueIg = activation( - _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)), active_gate); - valueFg = activation( - _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)), active_gate); - state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg), - _mm256_mul_ps(prevState, valueFg)); - valueOg = activation(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)), - active_gate); - stateAtv = activation(state, active_state); - output = _mm256_mul_ps(valueOg, stateAtv); + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = + activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), + active_gate); + value_fg = + activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), + active_gate); + state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), + _mm256_mul_ps(prev_state, value_fg)); + value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), + active_gate); + state_atv = activation(state, active_state); + output = _mm256_mul_ps(value_og, state_atv); } #endif #endif @@ -78,25 +81,26 @@ namespace backward { template class lstm { public: - HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, - T &gradIn, T &gradIg, T &gradFg, T &gradOg, - T &prevState, T &prevStateGrad, T &state, - T &stateGrad, T &stateAtv, T &outputGrad, + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, + T &prev_state, T &prev_state_grad, T &state, + T &state_grad, T &state_atv, T &output_grad, T &checkI, T &checkF, T &checkO, T &checkIGrad, T &checkFGrad, T &checkOGrad, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { - gradOg = activation(outputGrad * stateAtv, valueOg, active_gate); - stateGrad += activation(outputGrad * valueOg, stateAtv, active_state) + - gradOg * checkO; - gradIn = activation(stateGrad * valueIg, valueIn, active_node); - gradIg = activation(stateGrad * valueIn, valueIg, active_gate); - gradFg = activation(stateGrad * prevState, valueFg, active_gate); - prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg; - checkIGrad = gradIg * prevState; - checkFGrad = gradFg * prevState; - checkOGrad = gradOg * state; + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + grad_og = activation(output_grad * state_atv, value_og, active_gate); + state_grad += activation(output_grad * value_og, state_atv, active_state) + + grad_og * checkO; + grad_in = activation(state_grad * value_ig, value_in, active_node); + grad_ig = activation(state_grad * value_in, value_ig, active_gate); + grad_fg = activation(state_grad * prev_state, value_fg, active_gate); + prev_state_grad = + grad_ig * 
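// Note (not in the original patch): written out, the scalar backward here is
// the chain rule for the peephole LSTM step c_t = a*i + c_prev*f and
// h_t = o * s with s = act_state(c_t):
//   d_o      = g'(o) * (dh * s)
//   d_c     += s'(s) * (dh * o) + d_o * w_co
//   d_a      = n'(a) * (d_c * i)
//   d_i      = g'(i) * (d_c * a)
//   d_f      = g'(f) * (d_c * c_prev)
//   d_c_prev = d_i * w_ci + d_f * w_cf + d_c * f
//   d_w_ci = d_i * c_prev,  d_w_cf = d_f * c_prev,  d_w_co = d_o * c_t
// where activation(grad, value, act) multiplies grad by the activation
// derivative evaluated at the stored value.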
checkI + grad_fg * checkF + state_grad * value_fg; + checkIGrad = grad_ig * prev_state; + checkFGrad = grad_fg * prev_state; + checkOGrad = grad_og * state; } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default @@ -105,32 +109,32 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; HOSTDEVICE void operator()( - __m256 &valueIn, __m256 &valueIg, __m256 &valueFg, __m256 &valueOg, - __m256 &gradIn, __m256 &gradIg, __m256 &gradFg, __m256 &gradOg, - __m256 &prevState, __m256 &prevStateGrad, __m256 &state, - __m256 &stateGrad, __m256 &stateAtv, __m256 &outputGrad, __m256 &checkI, - __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad, - __m256 &checkOGrad, activation_mode_t active_node, - activation_mode_t active_gate, activation_mode_t active_state) { - gradOg = - activation(_mm256_mul_ps(outputGrad, stateAtv), valueOg, active_gate); - stateGrad = _mm256_add_ps( - activation(_mm256_mul_ps(outputGrad, valueOg), stateAtv, active_state), - stateGrad); - stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad); - gradIn = - activation(_mm256_mul_ps(stateGrad, valueIg), valueIn, active_node); - gradIg = - activation(_mm256_mul_ps(stateGrad, valueIn), valueIg, active_gate); - gradFg = - activation(_mm256_mul_ps(stateGrad, prevState), valueFg, active_gate); - prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI), - _mm256_mul_ps(gradFg, checkF)); - prevStateGrad = - _mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad); - checkIGrad = _mm256_mul_ps(gradIg, prevState); - checkFGrad = _mm256_mul_ps(gradFg, prevState); - checkOGrad = _mm256_mul_ps(gradOg, state); + __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, + __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, + __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, + __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, + __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, + __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { + grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, + active_gate); + state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), + state_atv, active_state), + state_grad); + state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); + grad_in = + activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); + grad_ig = + activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); + grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, + active_gate); + prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), + _mm256_mul_ps(grad_fg, checkF)); + prev_state_grad = + _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); + checkIGrad = _mm256_mul_ps(grad_ig, prev_state); + checkFGrad = _mm256_mul_ps(grad_fg, prev_state); + checkOGrad = _mm256_mul_ps(grad_og, state); } #endif #endif diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h new file mode 100644 index 0000000000000000000000000000000000000000..e3a3ef2badc37924d866ded8ee7a7338fbc4b2d2 --- /dev/null +++ b/paddle/operators/math/detection_util.h @@ -0,0 +1,300 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "paddle/framework/selected_rows.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct BBox { + BBox(T x_min, T y_min, T x_max, T y_max) + : x_min(x_min), + y_min(y_min), + x_max(x_max), + y_max(y_max), + is_difficult(false) {} + + BBox() {} + + T get_width() const { return x_max - x_min; } + + T get_height() const { return y_max - y_min; } + + T get_center_x() const { return (x_min + x_max) / 2; } + + T get_center_y() const { return (y_min + y_max) / 2; } + + T get_area() const { return get_width() * get_height(); } + + // coordinate of bounding box + T x_min; + T y_min; + T x_max; + T y_max; + // whether difficult object (e.g. object with heavy occlusion is difficult) + bool is_difficult; +}; +// KNCHW ==> NHWC +// template +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec); +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2); +template +bool SortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices); +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices); +template +BBox ClipBBox(const BBox& bbox); +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data); +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { + size_t out_offset = bbox_vec.size(); + bbox_vec.resize(bbox_vec.size() + num_bboxes); + for (size_t i = 0; i < num_bboxes; ++i) { + BBox bbox; + bbox.x_min = *(prior_data + i * 8); + bbox.y_min = *(prior_data + i * 8 + 1); + bbox.x_max = *(prior_data + i * 8 + 2); + bbox.y_max = *(prior_data + i * 8 + 3); + bbox_vec[out_offset + i] = bbox; + } +} +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { + size_t out_offset = var_vec.size(); + var_vec.resize(var_vec.size() + num); + for (size_t i = 0; i < num; ++i) { + std::vector var; + 
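// Note (not in the original patch): prior_data stores 8 floats per prior box,
// four corner coordinates followed by four encoding variances, which is why
// both readers stride by 8:
//   prior_data[i * 8 + 0 .. 3] -> x_min, y_min, x_max, y_max
//   prior_data[i * 8 + 4 .. 7] -> per-coordinate variances (read just below)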
var.push_back(*(prior_data + i * 8 + 4)); + var.push_back(*(prior_data + i * 8 + 5)); + var.push_back(*(prior_data + i * 8 + 6)); + var.push_back(*(prior_data + i * 8 + 7)); + var_vec[out_offset + i] = var; + } +} +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { + T prior_bbox_width = prior_bbox.get_width(); + T prior_bbox_height = prior_bbox.get_height(); + T prior_bbox_center_x = prior_bbox.get_center_x(); + T prior_bbox_center_y = prior_bbox.get_center_y(); + + T decoded_bbox_center_x = + prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + + prior_bbox_center_x; + T decoded_bbox_center_y = + prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + + prior_bbox_center_y; + T decoded_bbox_width = + std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; + T decoded_bbox_height = + std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; + + BBox decoded_bbox; + decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; + decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; + decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; + decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; + + return decoded_bbox; +} +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { + if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || + bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { + return 0.0; + } else { + T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); + T inter_y_min = std::max(bbox1.y_min, bbox2.y_min); + T interX_max = std::min(bbox1.x_max, bbox2.x_max); + T interY_max = std::min(bbox1.y_max, bbox2.y_max); + + T inter_width = interX_max - inter_x_min; + T inter_height = interY_max - inter_y_min; + T inter_area = inter_width * inter_height; + + T bbox_area1 = bbox1.get_area(); + T bbox_area2 = bbox2.get_area(); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } +} + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { + std::vector> scores; + for (size_t i = 0; i < num_priors; ++i) { + size_t conf_offset = i * num_classes + class_idx; + if (conf_score_data[conf_offset] > conf_threshold) + scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); + } + std::stable_sort(scores.begin(), scores.end(), + SortScorePairDescend); + if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t saved_idx = (*indices)[i]; + T overlap = jaccard_overlap(bboxes[idx], bboxes[saved_idx]); + keep = overlap <= nms_threshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices) { + int total_keep_num = 0; + for (size_t n = 0; n < batch_size; 
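// Note (not in the original patch): ApplyNmsFast above is greedy non-maximum
// suppression -- keep candidates over conf_threshold, sort by score (capped
// at top_k), then repeatedly accept the best remaining box and reject any
// later box whose IoU with an accepted one exceeds nms_threshold:
//   while (!scores.empty()) {
//     idx = scores.front().second;
//     if (jaccard_overlap(box[idx], kept) <= nms_threshold for every kept box)
//       indices->push_back(idx);
//     scores.erase(scores.begin());
//   }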
++n) { + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + size_t num_detected = 0; + std::map> indices; + size_t conf_offset = n * num_priors * num_classes; + for (size_t c = 0; c < num_classes; ++c) { + if (c == background_label_id) continue; + ApplyNmsFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); + num_detected += indices[c].size(); + } + if (top_k > 0 && num_detected > top_k) { + // std::vector> score_index_pairs; + std::vector>> score_index_pairs; + for (size_t c = 0; c < num_classes; ++c) { + const std::vector& label_indices = indices[c]; + for (size_t i = 0; i < label_indices.size(); ++i) { + size_t idx = label_indices[i]; + score_index_pairs.push_back( + std::make_pair((conf_data + conf_offset)[idx * num_classes + c], + std::make_pair(c, idx))); + } + } + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(top_k); + std::map> new_indices; + for (size_t i = 0; i < score_index_pairs.size(); ++i) { + size_t label = score_index_pairs[i].second.first; + size_t idx = score_index_pairs[i].second.second; + new_indices[label].push_back(idx); + } + all_detection_indices->push_back(new_indices); + total_keep_num += top_k; + } else { + all_detection_indices->push_back(indices); + total_keep_num += num_detected; + } + } + return total_keep_num; +} +template +BBox ClipBBox(const BBox& bbox) { + T one = static_cast(1.0); + T zero = static_cast(0.0); + BBox clipped_bbox; + clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); + clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); + clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); + clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); + return clipped_bbox; +} +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data) { + size_t count = 0; + for (size_t n = 0; n < batch_size; ++n) { + for (std::map>::const_iterator it = + all_indices[n].begin(); + it != all_indices[n].end(); ++it) { + size_t label = it->first; + const std::vector& indices = it->second; + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t conf_offset = n * num_priors * num_classes + idx * num_classes; + out_data[count * 7] = n; + out_data[count * 7 + 1] = label; + out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; + BBox clipped_bbox = ClipBBox(decoded_bboxes[idx]); + out_data[count * 7 + 3] = clipped_bbox.x_min; + out_data[count * 7 + 4] = clipped_bbox.y_min; + out_data[count * 7 + 5] = clipped_bbox.x_max; + out_data[count * 7 + 6] = clipped_bbox.y_max; + ++count; + } + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index 125af449d3f700e24be5e4b7615c3b0e03fd4e5b..101ab859624869bf34d171cd42d46d0c5bdac29c 100644 --- a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -19,83 +19,85 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, - activation_mode_t active_node, - activation_mode_t active_gate) { 
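// Note (not in the original patch): GetDetectionOutput above flattens every
// kept detection into a fixed 7-float record:
//   out[k*7 + 0]    = image index within the batch
//   out[k*7 + 1]    = class label
//   out[k*7 + 2]    = confidence score
//   out[k*7 + 3..6] = clipped x_min, y_min, x_max, y_max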
+struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { #ifndef __NVCC__ - if (value.prevOutValue) { - math::gemm( - context, false, false, batchSize, frameSize * 2, frameSize, 1, - value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, - value.gateValue, frameSize * 3); + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); } detail::forward_reset_output(detail::forward::gru_resetOutput(), value, - frameSize, batchSize, active_gate); + frame_size, batch_size, active_gate); - if (value.prevOutValue) { - math::gemm( - context, false, false, batchSize, frameSize, frameSize, 1, - value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, - value.gateValue + frameSize * 2, frameSize * 3); + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); } detail::forward_final_output(detail::forward::gru_finalOutput(), value, - frameSize, batchSize, active_node); + frame_size, batch_size, active_node); #endif } }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, - activation_mode_t active_gate) { +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { #ifndef __NVCC__ detail::backward_state_grad(detail::backward::gru_stateGrad(), value, - grad, frameSize, batchSize, active_node); + grad, frame_size, batch_size, active_node); - if (value.prevOutValue && grad.prevOutGrad) { - math::gemm( - context, false, true, batchSize, frameSize, frameSize, 1, - grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, - frameSize, 0, grad.resetOutputGrad, frameSize); + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); - if (grad.stateWeightGrad) { - math::gemm( - context, true, false, frameSize, frameSize, batchSize, 1, - value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, - frameSize * 3, 1, grad.stateWeightGrad, frameSize); + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, - grad, frameSize, batchSize, active_gate); + grad, frame_size, batch_size, active_gate); - if (grad.prevOutGrad && value.prevOutValue) { - math::gemm( - context, false, true, batchSize, frameSize, frameSize * 2, 1, - grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, - grad.prevOutGrad, frameSize); + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + 
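// Note (not in the original patch): the CPU GRU forward above runs in four
// phases, assuming gate_value arrives pre-filled with the input projection:
//   1) GEMM: gates[:, 0:2F] += h_prev * W_gate       (update u, reset r)
//   2) gru_resetOutput: r_h = act_gate(r) * h_prev   (elementwise)
//   3) GEMM: gates[:, 2F:3F] += r_h * W_state        (candidate c)
//   4) gru_finalOutput: h = (1 - u) * h_prev + u * act_node(c)
// which is why gate_value is laid out as [u | r | c] with leading dim 3F.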
context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); - if (grad.gateWeightGrad) { - math::gemm( - context, true, false, frameSize, frameSize * 2, batchSize, 1, - value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, - grad.gateWeightGrad, frameSize * 2); + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); } } #endif } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 7b9e54ac029f6aa00553338435684097d6d02b25..d5a0e630ea0eadea990988c3170395c842a91900 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -19,159 +19,159 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, - activation_mode_t active_node, - activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - if (value.prevOutValue) { - math::gemm( - context, false, false, batchSize, frameSize * 2, frameSize, 1, - value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, - value.gateValue, frameSize * 3); + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruForwardResetOutput, - /* isBatch= */ false, + /* is_batch= */ false, T><<>>( - detail::forward::gru_resetOutput(), value.gateValue, - value.resetOutputValue, value.prevOutValue, frameSize, batchSize, - active_gate); + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } else { detail::KeGruForwardResetOutput, - /* isBatch= */ true, + /* is_batch= */ true, T><<>>( - detail::forward::gru_resetOutput(), value.gateValue, - value.resetOutputValue, value.prevOutValue, frameSize, batchSize, - active_gate); + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } - if (value.prevOutValue) { - math::gemm( - context, false, false, batchSize, frameSize, frameSize, 1, - value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, - value.gateValue + frameSize * 2, frameSize * 3); + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruForwardFinalOutput, - /* isBatch= */ false, + /* is_batch= */ false, T><<>>( - detail::forward::gru_finalOutput(), value.gateValue, - value.prevOutValue, value.outputValue, frameSize, batchSize, + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } else { detail::KeGruForwardFinalOutput, - /* isBatch= */ true, + /* is_batch= */ true, T><<>>( - detail::forward::gru_finalOutput(), value.gateValue, - value.prevOutValue, value.outputValue, frameSize, batchSize, + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } } }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, - activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? 
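// Note (not in the original patch): the GRU forward above needs two kernels
// per step because the candidate GEMM consumes reset_output_value written by
// KeGruForwardResetOutput; issuing
//   gemm -> KeGruForwardResetOutput -> gemm -> KeGruForwardFinalOutput
// on one CUDA stream gives that ordering without explicit synchronization.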
frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* isBatch= */ false><<>>( - detail::backward::gru_stateGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, - batchSize, active_node); + /* is_batch= */ false><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); } else { detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* isBatch= */ true><<>>( - detail::backward::gru_stateGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, - batchSize, active_node); + /* is_batch= */ true><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); } - if (value.prevOutValue && grad.prevOutGrad) { - math::gemm( - context, false, true, batchSize, frameSize, frameSize, 1, - grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, - frameSize, 0, grad.resetOutputGrad, frameSize); - - if (grad.stateWeightGrad) { - math::gemm( - context, true, false, frameSize, frameSize, batchSize, 1, - value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, - frameSize * 3, 1, grad.stateWeightGrad, frameSize); + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* isBatch= */ false><<>>( - detail::backward::gru_resetGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, - batchSize, active_gate); + /* is_batch= */ false><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } else { detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* isBatch= */ true><<>>( - detail::backward::gru_resetGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, - batchSize, active_gate); + /* is_batch= */ true><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } - if (grad.prevOutGrad && value.prevOutValue) { - math::gemm( - context, false, 
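// Note (not in the original patch): the transpose flags distinguish the two
// GEMMs of each pair above. With dG the gate-gradient block (batch_size x
// 3 * frame_size), the data gradient uses (transA=false, transB=true):
//   d_reset_output = dG_c * W_state^T            (beta = 0, overwrite)
// while the weight gradient uses (transA=true, transB=false):
//   dW_state += reset_output^T * dG_c            (beta = 1, accumulate)
// following C = alpha * op(A) * op(B) + beta * C with the (m, n, k) passed.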
-    if (grad.prevOutGrad && value.prevOutValue) {
-      math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
-          grad.prevOutGrad, frameSize);
-
-      if (grad.gateWeightGrad) {
-        math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+    if (grad.prev_out_grad && value.prev_out_value) {
+      math::gemm<platform::CUDADeviceContext, T>(
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
+
+      if (grad.gate_weight_grad) {
+        math::gemm<platform::CUDADeviceContext, T>(
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
       }
     }
   }
 };
 
-template struct GRUUnitFunctor<platform::GPUPlace, float>;
-template struct GRUUnitFunctor<platform::GPUPlace, double>;
-template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
-template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
+template struct GRUUnitFunctor<platform::CUDADeviceContext, float>;
+template struct GRUUnitFunctor<platform::CUDADeviceContext, double>;
+template struct GRUUnitGradFunctor<platform::CUDADeviceContext, float>;
+template struct GRUUnitGradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
index 1475fb38104f353857dfd968e46af98a6d52c52a..bf69147b506661692a6d71823043cd3506ea8b5d 100644
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 
@@ -19,41 +19,40 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-// TODO(guosheng): refine code style in gru_compute
 template <typename T>
-struct hl_gru_value {
-  T *gateWeight;
-  T *stateWeight;
-  T *gateValue;
-  T *resetOutputValue;
-  T *outputValue;
-  T *prevOutValue;
+struct GRUMetaValue {
+  T *gate_weight;
+  T *state_weight;
+  T *gate_value;
+  T *reset_output_value;
+  T *output_value;
+  T *prev_out_value;
 };
 
 template <typename T>
-struct hl_gru_grad {
-  T *gateWeightGrad;
-  T *stateWeightGrad;
-  T *gateGrad;
-  T *resetOutputGrad;
-  T *outputGrad;
-  T *prevOutGrad;
+struct GRUMetaGrad {
+  T *gate_weight_grad;
+  T *state_weight_grad;
+  T *gate_grad;
+  T *reset_output_grad;
+  T *output_grad;
+  T *prev_out_grad;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct GRUUnitFunctor {
-  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate);
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct GRUUnitGradFunctor {
-  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
-                      activation_mode_t active_gate);
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
+                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc
index c10c44c52076c8ee56eee3a0d82c31df70a1c9c7..c2633b2e16434558d16f699a701e7b8cf1de8342 100644
--- a/paddle/operators/math/im2col.cc
+++ 
b/paddle/operators/math/im2col.cc @@ -25,9 +25,9 @@ namespace math { */ template class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -61,14 +61,13 @@ class Im2ColFunctor(); T* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; - int c_im = c / filter_width / filter_height; + int c_im = c / (filter_width * filter_height); for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < col_width; ++w) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; int col_idx = (c * col_height + h) * col_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; @@ -90,9 +89,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -130,16 +129,14 @@ class Col2ImFunctor= 0 && (im_row_idx) < im_height && (im_col_idx) >= 0 && (im_col_idx) < im_width) { - im_row_idx += c_im * im_height; - im_data[im_row_idx * im_width + im_col_idx] += + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += col_data[(c * col_height + h) * col_width + w]; } } @@ -149,13 +146,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; /* * im = [input_channels, input_height, input_width] @@ -164,9 +161,9 @@ template class Col2ImFunctor class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -199,12 +196,13 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -271,12 +269,13 @@ class Col2ImFunctor= 0 && im_row_offset < im_height && im_col_offset >= 0 && im_col_offset < im_width) { int im_offset = @@ -300,13 +300,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index bf7894243919571c2ab15d53690b1ef05bfcc6ee..a88e837b030f286cce272f99ad7991c70336e4a9 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -58,9 +58,9 @@ __global__ void im2col(const T* 
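// The CPU kCFO functor above walks the column buffer and maps each entry
// back to an input pixel; the patch hoists im_row_idx out of the inner loop
// and rewrites c_im as c / (filter_width * filter_height). A standalone
// sketch of that index arithmetic (hypothetical helper, square stride, pad
// and dilation for brevity):
#include <vector>

std::vector<float> im2col_cfo(const std::vector<float>& im, int channels,
                              int im_height, int im_width, int fh, int fw,
                              int stride, int pad, int dilation,
                              int col_height, int col_width) {
  int channels_col = channels * fh * fw;
  std::vector<float> col(channels_col * col_height * col_width, 0.f);
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % fw;         // column within the filter
    int h_offset = (c / fw) % fh;  // row within the filter
    int c_im = c / (fw * fh);      // source image channel
    for (int h = 0; h < col_height; ++h) {
      int row = h * stride - pad + h_offset * dilation;  // hoisted, as in the patch
      for (int w = 0; w < col_width; ++w) {
        int colw = w * stride - pad + w_offset * dilation;
        if (row >= 0 && row < im_height && colw >= 0 && colw < im_width) {
          col[(c * col_height + h) * col_width + w] =
              im[(c_im * im_height + row) * im_width + colw];
        }
      }
    }
  }
  return col;
}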
data_im, int num_outs, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -96,9 +96,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2col<<>>( im.data(), num_outputs, im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -160,9 +158,9 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -203,9 +201,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2im<<>>( num_kernels, col.data(), im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[2], col_height, col_width, im->data()); @@ -213,13 +209,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; template __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, @@ -260,9 +256,9 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -310,9 +306,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2colOCF<<>>( im.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -358,9 +352,9 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -409,9 +403,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2imOCF<<>>( col.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, im->data()); @@ -419,13 +411,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index deb60051beef56437cf75f0fa2cef90bbc0a209a..38f2c9fe0adf80a2a4355a45bebb9ba0f341d1ab 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -15,6 +15,7 
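// Every launch in im2col.cu above now reads the stream straight off the
// typed context (the "<<<grid, threads, 0, context.stream()>>>" form)
// instead of reinterpret_casting the base DeviceContext. A minimal sketch of
// that pattern with stand-in types (CudaCtx is hypothetical; the real class
// is platform::CUDADeviceContext):
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, int n, float k) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= k;
}

struct CudaCtx {
  cudaStream_t stream_;
  cudaStream_t stream() const { return stream_; }  // no cast needed
};

void scale(const CudaCtx& ctx, float* data, int n, float k) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  scale_kernel<<<blocks, threads, 0, ctx.stream()>>>(data, n, k);
}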
@@ limitations under the License. */ #pragma once #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" namespace paddle { @@ -78,20 +79,19 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; * \note The caller needs to ensure that imShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Im2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col); }; -template +template class Col2ImFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 10c28da72ba9d3b94bb59c5cf00e7f5a2f28fd06..1ba24325ffe3922081f59abfc1c67c95b514bcfa 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -14,9 +14,8 @@ limitations under the License. */ #include "paddle/operators/math/im2col.h" #include -#include -template +template void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; @@ -59,22 +58,11 @@ void testIm2col() { memcpy(input_ptr, arr, 6 * sizeof(float)); auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + Copy(input_tmp, *place, *context, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -83,10 +71,10 @@ void testIm2col() { // Im2Col paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> im2col; paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -99,7 +87,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context); + Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -110,19 +98,20 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context); + Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); out_ocf_ptr = output_tmp.data(); } + for (int i = 0; i < 6; ++i) { EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); } // Col2Im: kCFO 
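// The test refactor above replaces the is_cpu_place if/else context
// construction with a single DeviceContext template parameter, so each
// Place type brings its matching context. A reduced sketch of the pattern
// (stand-in types; the real pairs are CPUPlace/CPUDeviceContext and, under
// CUDA, CUDAPlace/CUDADeviceContext):
struct FakePlace {};
struct FakeCtx {
  explicit FakeCtx(const FakePlace&) {}
};

template <typename Place, typename DeviceContext>
void test_on_device() {
  Place place;
  DeviceContext context(place);  // one line replaces the old branching setup
  // ... run the functor under test against `context` ...
}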
paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> col2im; paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -130,7 +119,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + Copy(input_tmp, *place, *context, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -139,7 +128,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -151,7 +140,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + Copy(input_tmp, *place, *context, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -159,17 +148,21 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { EXPECT_EQ(in_ptr[i], col2im_data[i]); } + + delete place; + delete context; } TEST(math, im2col) { - testIm2col(); + testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index 0febf8e3b70111d12f858cf6259a2801a42d9a90..d453102ecefc9d79e1f4474ba94be0eb69a87c85 100644 --- a/paddle/operators/math/lstm_compute.cc +++ b/paddle/operators/math/lstm_compute.cc @@ -21,61 +21,61 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, - ActiveType(cand_act), ActiveType(gate_act), - ActiveType(cell_act)); - value.gateValue += frame_size * 4; - value.stateValue += frame_size; - value.stateActiveValue += frame_size; - value.outputValue += frame_size; - if (value.prevStateValue) { - value.prevStateValue += frame_size; + cand_act, gate_act, cell_act); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; } } } }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const 
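// The CPU LSTM functors above iterate the batch by bumping raw pointers:
// the fused gate buffer advances 4 * frame_size per sample (one stripe per
// gate), every other buffer advances frame_size. A sketch of that traversal
// with hypothetical buffer names:
template <typename T>
void walk_batch(T* gate, T* state, T* output, T* prev_state, int frame_size,
                int batch_size) {
  for (int b = 0; b < batch_size; ++b) {
    // ... process one sample's slice here ...
    gate += frame_size * 4;  // the four gate stripes are packed per row
    state += frame_size;
    output += frame_size;
    if (prev_state) prev_state += frame_size;  // may be null on step 0
  }
}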
std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, - frame_size, ActiveType(cand_act), - ActiveType(gate_act), ActiveType(cell_act)); + frame_size, cand_act, gate_act, cell_act); - value.gateValue += frame_size * 4; - value.stateValue += frame_size; - value.stateActiveValue += frame_size; - value.outputValue += frame_size; - if (value.prevStateValue) { - value.prevStateValue += frame_size; + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; } - grad.gateGrad += frame_size * 4; - grad.stateGrad += frame_size; - grad.stateActiveGrad += frame_size; - grad.outputGrad += frame_size; - if (grad.prevStateGrad) { - grad.prevStateGrad += frame_size; + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; } } } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu index b2122f2a5c08a6d9d53293833177f0ba2c3ab860..82065d699f760db6cc86bf3d6c56e51c583c6ace 100644 --- a/paddle/operators/math/lstm_compute.cu +++ b/paddle/operators/math/lstm_compute.cu @@ -21,34 +21,36 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, ActiveType(cand_act), - ActiveType(gate_act), ActiveType(cell_act)); + frame_size, batch_size, cand_act, gate_act, + cell_act); } }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, ActiveType(cand_act), - ActiveType(gate_act), ActiveType(cell_act)); + frame_size, batch_size, cand_act, gate_act, + cell_act); } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class 
LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 28d2c6fd3b0d8143da90c37f241072e37397f98b..e1ad6b64d201ef99d83eaa2a821356008dcc9c8e 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" @@ -21,69 +22,48 @@ namespace paddle { namespace operators { namespace math { -typedef enum { - HL_ACTIVATION_SIGMOID = 0, - HL_ACTIVATION_RELU = 1, - HL_ACTIVATION_TANH = 2, - HL_ACTIVATION_LINEAR = 3, - HL_ACTIVATION_END -} activation_mode_t; - template struct LstmMetaValue { - T *gateValue; - T *prevStateValue; - T *stateValue; - T *stateActiveValue; - T *outputValue; - T *checkIg; - T *checkFg; - T *checkOg; + T *gate_value; + T *prev_state_value; + T *state_value; + T *state_active_value; + T *output_value; + T *check_ig; + T *check_fg; + T *check_og; }; template struct LstmMetaGrad { - T *gateGrad; - T *prevStateGrad; - T *stateGrad; - T *stateActiveGrad; - T *outputGrad; - T *checkIgGrad; - T *checkFgGrad; - T *checkOgGrad; + T *gate_grad; + T *prev_state_grad; + T *state_grad; + T *state_active_grad; + T *output_grad; + T *check_ig_grad; + T *check_fg_grad; + T *check_og_grad; }; -inline activation_mode_t ActiveType(const std::string &type) { - if (type == "sigmoid") { - return HL_ACTIVATION_SIGMOID; - } else if (type == "relu") { - return HL_ACTIVATION_RELU; - } else if (type == "tanh") { - return HL_ACTIVATION_TANH; - } else if (type == "linear" || type == "identity" || type == "") { - return HL_ACTIVATION_LINEAR; - } else { - PADDLE_THROW("Do not support activation type."); - } -} - -template +template class LstmUnitFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, int frame_size, int batch_size, - const std::string &gate_act, const std::string &cell_act, - const std::string &cand_act); + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); }; -template +template class LstmUnitGradFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, - const std::string &gate_act, const std::string &cell_act, - const std::string &cand_act); + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); }; } // namespace math diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2e333a8cde721f8e65dbf2cf5e3aac6272172cc0..dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -21,13 +21,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { 
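// lstm_compute.h above drops the activation_mode_t enum and the string
// parser ActiveType() in favor of detail::ActivationType, so the attribute
// string is resolved once at the op layer instead of on every functor call.
// A hypothetical mirror of that mapping (enumerator names assumed, not
// quoted from the patch):
#include <stdexcept>
#include <string>

enum class ActivationType { kSigmoid, kReLU, kTanh, kIdentity };

ActivationType activation_from_string(const std::string& s) {
  if (s == "sigmoid") return ActivationType::kSigmoid;
  if (s == "relu") return ActivationType::kReLU;
  if (s == "tanh") return ActivationType::kTanh;
  if (s == "linear" || s == "identity" || s.empty())
    return ActivationType::kIdentity;
  throw std::invalid_argument("unsupported activation: " + s);
}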
+void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -36,13 +34,11 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -51,35 +47,32 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -99,15 +92,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? 
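// The CPU gemm wrappers above derive leading dimensions from the stored
// shapes: for packed row-major data, A's row stride is K when untransposed
// and M when transposed, and C's is always N. A small helper expressing the
// same rule (a sketch assuming densely packed operands):
#include <cblas.h>

void sgemm_packed(bool trans_a, bool trans_b, int m, int n, int k,
                  float alpha, const float* a, const float* b, float beta,
                  float* c) {
  int lda = trans_a ? m : k;  // stored shape is (k x m) when transposed
  int ldb = trans_b ? k : n;
  cblas_sgemm(CblasRowMajor, trans_a ? CblasTrans : CblasNoTrans,
              trans_b ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
              ldb, beta, c, /*ldc=*/n);
}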
CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -127,16 +121,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -157,8 +151,8 @@ void batched_gemm( } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -183,8 +177,8 @@ void batched_gemm( // functions of Intel MKL are not available. In the future, this computation // should be parallelized. 
template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -192,14 +186,14 @@ void batched_gemm( const float* Ak = &A[k * strideA]; const float* Bk = &B[k * strideB]; float* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, Ak, - Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -207,55 +201,56 @@ void batched_gemm( const double* Ak = &A[k * strideA]; const double* Bk = &B[k * strideB]; double* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, - Ak, Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } #endif template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? 
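// Without MKL's batched interface, batched_gemm above degrades to a serial
// loop of single gemms with strided A/B and contiguous M*N output blocks.
// The shape of that fallback, sketched with a generic gemm callback:
template <typename T, typename GemmFn>
void batched_gemm_fallback(GemmFn gemm, int m, int n, int k, T alpha,
                           const T* a, const T* b, T beta, T* c,
                           int batch_count, int stride_a, int stride_b) {
  for (int i = 0; i < batch_count; ++i) {
    // one independent gemm per batch element; trivially parallelizable,
    // which is exactly the TODO noted in the comment above
    gemm(m, n, k, alpha, &a[i * stride_a], &b[i * stride_b], beta,
         &c[i * m * n]);
  }
}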
CblasNoTrans : CblasTrans; cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -310,10 +305,31 @@ void set_constant(const platform::DeviceContext& context, #endif } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 58356a4b7783241ca0292829bf05dc1a8ed80c6c..d47a7f818ded61baf31e46ea3b8ae3101324111f 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -22,13 +22,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -39,19 +37,16 @@ void gemm(const platform::DeviceContext& context, (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -61,51 +56,45 @@ void gemm(const platform::DeviceContext& context, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? 
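// Every cuBLAS call above passes the operands as (B, A) with dimensions
// (N, M, K): cuBLAS is column-major, and a row-major M x N buffer is a
// column-major N x M buffer, so computing C^T = B^T * A^T yields row-major
// C = A * B without any explicit transpose. A sketch of that wrapper:
#include <cublas_v2.h>

cublasStatus_t sgemm_row_major(cublasHandle_t handle, bool trans_a,
                               bool trans_b, int m, int n, int k, float alpha,
                               const float* a, int lda, const float* b,
                               int ldb, float beta, float* c, int ldc) {
  // note the swapped operand order and the swapped m/n
  return cublasSgemm(handle, trans_b ? CUBLAS_OP_T : CUBLAS_OP_N,
                     trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, &alpha, b,
                     ldb, a, lda, &beta, c, ldc);
}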
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -116,7 +105,7 @@ void matmul( PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_out->place()), - "Matrix must all be in GPUPlace"); + "Matrix must all be in CUDAPlace"); int M = dim_out[0]; int N = dim_out[1]; @@ -125,15 +114,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -144,7 +134,7 @@ void matmul( PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_out->place()), - "Matrix must all be in GPUPlace"); + "Matrix must all be in CUDAPlace"); int M = dim_out[0]; int N = dim_out[1]; @@ -153,14 +143,14 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -176,15 +166,13 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -200,68 +188,58 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasSgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { cublasOperation_t cuTransA = (trans_a == false) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasDgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -277,8 +255,9 @@ struct TensorSetConstantGPU { template void operator()() const { - SetConstant functor; - functor(context_, tensor_, static_cast(value_)); + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); } const platform::DeviceContext& context_; @@ -287,17 +266,64 @@ struct TensorSetConstantGPU { }; template <> -void set_constant_with_place( +void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { framework::VisitDataType(framework::ToDataType(tensor->type()), TensorSetConstantGPU(context, tensor, value)); } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; +template +__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width, + int num) { + T tmp = 1.0 / width; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), vector.data(), output->data(), + static_cast(in_dims[1]), static_cast(input.numel())); + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, +// and only 
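// RowwiseAddKernel above is a grid-stride loop: a fixed grid covers any
// element count because each thread strides by blockDim.x * gridDim.x. A
// standalone equivalent that uses exact integer arithmetic in place of the
// 1.0 / width reciprocal trick:
__global__ void rowwise_add_sketch(const float* in, const float* vec,
                                   float* out, int width, int num) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    out[i] = in[i] + vec[i % width];  // broadcast vec across every row
  }
}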
failed for this case. So reimplemented it. +template <> +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), size); + framework::Tensor one; + one.mutable_data({in_dims[0]}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); +} } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index ffb99f53808c4316ede96b04e57aec4dae4134de..8cc03c2ba0facae691a0d2b8a4f2ea768cfa5491 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include @@ -49,6 +49,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, #include "paddle/framework/eigen.h" #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" @@ -61,53 +62,51 @@ namespace math { // Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html -template -void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const T alpha, const T* A, const T* B, const T beta, T* C); // gemm wrapper with stride args for matrix uncontinuous in memory -template -void gemm(const platform::DeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const T alpha, const T* A, const int lda, const T* B, const int ldb, - const T beta, T* C, const int ldc); +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); // matrix multiply with continuous memory -template -void matmul(const platform::DeviceContext& context, - const framework::Tensor& matrix_a, bool trans_a, - const framework::Tensor& matrix_b, bool trans_b, T alpha, - framework::Tensor* matrix_out, T beta); +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); // Batched gemm -template -void batched_gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const T alpha, - const T* A, const T* B, const T beta, T* C, - const int batchCount, const int strideA, const int strideB); - -template -void gemv(const platform::DeviceContext& context, const bool trans_a, - const int M, const int N, const T alpha, const T* A, const T* B, - const T beta, T* C); - -template -void axpy(const platform::DeviceContext& context, const int n, const T alpha, - const T* x, T* y); - -template +template +void 
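// The ColwiseSum workaround above reduces columns with a gemv against a
// ones vector: ones^T * X (gemv with trans = true) is exactly the vector of
// per-column sums, sidestepping the Eigen reduce that failed in debug mode.
// The identity restated in plain C++ (a reference sketch):
#include <vector>

std::vector<float> colwise_sum_ref(const std::vector<float>& x, int rows,
                                   int cols) {
  std::vector<float> ones(rows, 1.0f), sum(cols, 0.0f);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      sum[j] += ones[i] * x[i * cols + j];  // == gemv(trans, rows, cols, ...)
    }
  }
  return sum;
}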
batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template struct Transpose { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis); + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); }; -template +template struct SetConstant { - void operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num); + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); }; template @@ -117,17 +116,16 @@ void set_constant_with_place(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value); -template +template struct RowwiseAdd { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, const framework::Tensor& vec, - framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); }; -template +template struct ColwiseSum { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); }; } // namespace math diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 4dc17a4e525c52b8f696277274a7ad00a6b00a08..de591626df28e2bc3391b609f909612411398247 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -20,16 +20,17 @@ namespace paddle { namespace operators { namespace math { -template -void SetConstant::operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num) { +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.GetEigenDevice()) = t.constant(static_cast(num)); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); } -template -void Transpose::operator()( - const platform::DeviceContext& context, const framework::Tensor& in, +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { @@ -40,43 +41,51 @@ void Transpose::operator()( auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); - auto* dev = context.GetEigenDevice(); + auto* dev = context.eigen_device(); eigen_out.device(*dev) = eigen_in.shuffle(permute); } -template -void RowwiseAdd::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& 
input, + framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ(out->numel(), size); auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenMatrix::From(vector); - auto out = framework::EigenMatrix::From(*output); - Eigen::array shape({{1, static_cast(size)}}); - Eigen::array bcast({{static_cast(in_dims[0]), 1}}); - out.device(*context.GetEigenDevice()) = - in + vec.reshape(shape).broadcast(bcast); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); } -template -void ColwiseSum::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* vector) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size); +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); - auto vec = framework::EigenMatrix::From(*vector); - auto in = framework::EigenMatrix::From(input); - Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.GetEigenDevice()) = - in.sum(Eigen::array({{0}})).reshape(shape); -} + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + for (size_t j = 0; j < static_cast(size); ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc index 983c9fdcffb0a67da1bc0b5b4af9420a68bd2ac1..c9f322b92e5476d889b57bcc91a0ce1d9e5339d5 100644 --- a/paddle/operators/math/math_function_test.cc +++ b/paddle/operators/math/math_function_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
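// The CPU ColwiseSum specialization above avoids Eigen's general reduce by
// accumulating the row-major input sequentially; its i == 0 branch
// initializes the output on the first pass. An equivalent form that hoists
// the initialization out of the hot loop (a sketch):
#include <cstring>

void colwise_sum_cpu(const float* in, float* out, int height, int width) {
  std::memset(out, 0, sizeof(float) * width);  // init once, drop the branch
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      out[j] += in[i * width + j];  // sequential reads, cache friendly
    }
  }
}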
#include "paddle/operators/math/math_function.h" #include "gtest/gtest.h" @@ -21,7 +34,7 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, input3_ptr + 1, 4); @@ -55,7 +68,7 @@ TEST(math_function, gemm_trans_clbas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); @@ -74,7 +87,8 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant + paddle::operators::math::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); @@ -110,7 +124,7 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., data_a, data_b, 0., data_c); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 780d17ffc6539c5f4d67ebab5476d6f646840b41..6f16d6679248a88425e1de487988e50ee8a469bf 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#include "gtest/gtest.h" #include "paddle/operators/math/math_function.h" @@ -13,18 +26,18 @@ TEST(math_function, notrans_mul_trans) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({2, 2}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -47,18 +60,18 @@ TEST(math_function, trans_mul_notrans) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({3, 3}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -96,20 +109,20 @@ TEST(math_function, gemm_notrans_cublas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -151,20 +164,20 @@ TEST(math_function, gemm_trans_cublas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + 
paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -189,7 +202,7 @@ void GemvTest(int m, int n, bool trans) { T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); T* data_c = vec_c.mutable_data({trans ? n : m}, *cpu_place); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::framework::Tensor g_mat_a; paddle::framework::Tensor g_vec_b; paddle::framework::Tensor g_vec_c; @@ -205,14 +218,15 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CUDADeviceContext context(*gpu_place); - g_mat_a.CopyFrom(mat_a, *gpu_place, context); - g_vec_b.CopyFrom(vec_b, *gpu_place, context); + paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - vec_c.CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context); + paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h index 6ba9a0ba9a70bd938f9362179990ab68fa3186ba..7048e11e6f27a075892c28681a3c4913a27b3f3e 100644 --- a/paddle/operators/math/matmul.h +++ b/paddle/operators/math/matmul.h @@ -26,13 +26,12 @@ namespace math { // // Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported // yet. 
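Two further renames recur throughout the CUDA tests above: platform::GPUPlace becomes platform::CUDAPlace, and the Tensor::CopyFrom member is replaced by the free function paddle::framework::Copy, which takes the destination as an explicit out-parameter:

// before: dst.CopyFrom(src, place, context);
// after:  paddle::framework::Copy(src, place, context, &dst);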
-template +template class MatMulFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, T alpha, - framework::Tensor* out, T beta) { + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { auto dim_a = a.dims(); auto dim_b = b.dims(); @@ -108,13 +107,13 @@ class MatMulFunctor { if (!batchCount) { // regular matrix multiplication - gemm(context, transA, transB, M, N, kA, alpha, a.data(), - b.data(), beta, out->data()); + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); } else { // batched matrix multiplication - batched_gemm(context, transA, transB, M, N, kA, alpha, - a.data(), b.data(), beta, out->data(), - batchCount, strideA, strideB); + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); } } }; diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc index e5168ce7afd4139475afa6edd5999b9974407c9b..fea86675f75dad99a336d795d4561ae32d58c30a 100644 --- a/paddle/operators/math/maxouting.cc +++ b/paddle/operators/math/maxouting.cc @@ -20,11 +20,10 @@ namespace math { // All tensors are in NCHW format, and the groups must be greater than 1 template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * output, + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; @@ -37,34 +36,30 @@ class MaxOutFunctor { T* output_data = output->mutable_data(context.GetPlace()); for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; + int new_bindex = c_size * i; for (int c = 0; c < output_channels; ++c) { int new_cindex = fea_size * c; for (int f = 0; f < fea_size; ++f) { T ele = static_cast(-FLT_MAX); for (int ph = 0; ph < groups; ++ph) { - T x = input_data[(new_bindex + new_cindex) * groups - + ph * fea_size + f]; + T x = input_data[(new_bindex + new_cindex) * groups + + ph * fea_size + f]; ele = ele > x ? 
ele : x; } - output_data[(new_bindex+new_cindex+f)] = ele; + output_data[(new_bindex + new_cindex + f)] = ele; } } } } }; - - template -class MaxOutGradFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * input_grad, +class MaxOutGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad, - int groups) { + const framework::Tensor& output_grad, int groups) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -84,11 +79,11 @@ public: bool continue_match = true; int output_idx = blen + clen + f; for (int g = 0; g < groups && continue_match; ++g) { - int input_idx = input_idx0 + fea_size * g; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + int input_idx = input_idx0 + fea_size * g; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; + } } } } @@ -96,10 +91,10 @@ public: } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu index 7c698577b8a8258a58ba9a2b6c675457b2458a5b..6056ad251c12976fe9032f03aaaeb52727da1f42 100644 --- a/paddle/operators/math/maxouting.cu +++ b/paddle/operators/math/maxouting.cu @@ -21,9 +21,9 @@ namespace math { template __global__ void KernelMaxOut(const int nthreads, const T* input_data, - const int channels, - const int input_height, const int input_width, - int groups, T* output_data ) { + const int channels, const int input_height, + const int input_width, int groups, + T* output_data) { const int size = input_height * input_width * channels / groups; const int feat_len = input_height * input_width; int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -34,7 +34,7 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data, int channel_idx = batch_offset / feat_len; int feat_idx = batch_offset % feat_len; int data_idx = - (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; T ele = static_cast(-FLT_MAX); for (int g = 0; g < groups; ++g) { T x = input_data[data_idx + g * feat_len]; @@ -44,43 +44,44 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data, } } template -__global__ void KernelMaxoutGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, T* input_grad, const int channels, - const int input_height, const int input_width, int groups) { - const int size = input_height * input_width * channels / groups; - const int feat_len = input_height * input_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int batch_idx = i / size; - int batch_offset = i % size; - int channel_idx = batch_offset / feat_len; - int feat_idx = batch_offset % feat_len; - int data_idx = +__global__ void 
KernelMaxoutGrad(const int nthreads, const T* input_data, + const T* output_data, const T* output_grad, + T* input_grad, const int channels, + const int input_height, const int input_width, + int groups) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; - int max_index = -1; - bool continue_match = true; - for (int g = 0; g < groups && continue_match; ++g) { - if (input_data[data_idx + g * feat_len] == output_data[i]) { - max_index = data_idx + g * feat_len; - continue_match = false; - break; - } - } - if (max_index != -1) { - input_grad[max_index] += output_grad[index]; + int max_index = -1; + bool continue_match = true; + for (int g = 0; g < groups && continue_match; ++g) { + if (input_data[data_idx + g * feat_len] == output_data[i]) { + max_index = data_idx + g * feat_len; + continue_match = false; + break; } } + if (max_index != -1) { + input_grad[max_index] += output_grad[index]; + } + } } /* * All tensors are in NCHW format. */ template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor * output, + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -92,31 +93,26 @@ class MaxOutFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); + int nthreads = output->numel(); int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxOut< - T><<(context) - .stream()>>>(nthreads, input_data, input_channels, - input_height, input_width, groups, - output_data); + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); } }; /* * All tensors are in NCHW format. 
*/ template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * input_grad, + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad, - int groups) { + const framework::Tensor& output_grad, int groups) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -129,25 +125,22 @@ class MaxOutGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); + int nthreads = output.numel(); int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxoutGrad< - T><<(context) - .stream()>>>( + KernelMaxoutGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_grad_data, input_channels, input_height, input_width, groups); } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h index d4c9da38ab8f8d88ed461d805ae64a015db968c4..68f4743db07af0f369eb18f1a7cb6e326d469e85 100644 --- a/paddle/operators/math/maxouting.h +++ b/paddle/operators/math/maxouting.h @@ -21,24 +21,20 @@ namespace paddle { namespace operators { namespace math { -#define FLT_MAX \ - __FLT_MAX__ - -template +#define FLT_MAX __FLT_MAX__ +template class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor * output, - int groups); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); }; -template +template class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * input_grad, + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups); }; diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc index 135984586a67f666425f81456148c3623ed7ef25..150de6fd59ef3ac0c4cb9160bf5afb1ce1064577 100644 --- a/paddle/operators/math/pooling.cc +++ b/paddle/operators/math/pooling.cc @@ -24,9 +24,9 @@ namespace math { * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -84,9 +84,9 @@ class Pool2dFunctor { * and width, respectively. 
*/ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -152,9 +152,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -213,25 +213,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; /* * All tensors are in NCDHW format. @@ -239,9 +243,9 @@ template class Pool2dGradFunctor< * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -314,9 +318,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -398,9 +402,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. 
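For reference, the explicit pooling instantiations above read as follows once the elided template arguments are spelled out (reconstructed from the functor definitions; treat as a sketch, not the literal patch text):

template class Pool2dFunctor<platform::CPUDeviceContext,
                             paddle::operators::math::MaxPool<float>, float>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
                                 paddle::operators::math::MaxPoolGrad<float>,
                                 float>;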
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -473,25 +477,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; /* * All tensors are in NCHW format. @@ -499,9 +507,9 @@ template class Pool3dGradFunctor< * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -564,9 +572,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -602,10 +610,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. @@ -613,9 +625,9 @@ template class MaxPool2dWithIndexGradFunctor; * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -692,9 +704,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -735,10 +747,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu index ca3560f264b59057fd655084f3d43adc617c6606..0243cf8316a2a83bfc4c091f64419574c1be2f5c 100644 --- a/paddle/operators/math/pooling.cu +++ b/paddle/operators/math/pooling.cu @@ -155,9 +155,9 @@ __global__ void KernelMaxPool2DGrad( * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -183,11 +183,7 @@ class Pool2dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, output_data); @@ -200,9 +196,9 @@ class Pool2dFunctor { * height and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -231,11 +227,7 @@ class Pool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -249,9 +241,9 @@ class Pool2dGradFunctor { * height and width, respectively. 
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -281,10 +273,7 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -292,25 +281,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; template __global__ void KernelPool3D(const int nthreads, const T* input_data, @@ -478,9 +471,9 @@ __global__ void KernelMaxPool3DGrad( * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -512,11 +505,7 @@ class Pool3dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3D<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -531,9 +520,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -569,11 +558,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -588,9 +573,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. 
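A recurring simplification in the CUDA files above: because the context argument is now statically typed as platform::CUDADeviceContext, kernel launches no longer reinterpret_cast the generic DeviceContext to fetch the stream. Reconstructed pattern (the bracketed pieces are inferred):

// before (reconstructed):
//   KernelMaxPool2DGrad<T><<<grid, threads, 0,
//       reinterpret_cast<const platform::CUDADeviceContext&>(context)
//           .stream()>>>(...);
// after:
//   KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(...);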
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -626,10 +611,7 @@ class MaxPool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -638,25 +620,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; template __global__ void KernelMaxPool2dWithIdx( @@ -747,9 +733,9 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -776,10 +762,7 @@ class MaxPool2dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2dWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, output_data, mask_data); @@ -792,9 +775,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. 
*/ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -821,10 +804,7 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -832,10 +812,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx( @@ -950,9 +934,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -985,10 +969,7 @@ class MaxPool3dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdx<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -1002,9 +983,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -1037,10 +1018,7 @@ class MaxPool3dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, @@ -1049,10 +1027,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h index 19fbd8b4bb2469d3ce8a139ce30a48641dbd6e0f..2759f06cb6a51f7ceb6b8010d792030eb6ad5d3e 100644 --- a/paddle/operators/math/pooling.h +++ b/paddle/operators/math/pooling.h @@ -84,62 +84,58 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -template +template class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const 
framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -153,38 +149,38 @@ class MaxPool3dGradFunctor { * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in * NCDHW format. */ -template +template class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 075196b47eeaf118a588b96532d87a05e4e600c6..8a1ebb58c26578f076bf243adfbd51d10c682b99 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -12,15 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/math/selected_rows_functor.h" +#include + #include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -67,12 +69,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -88,7 +90,7 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -103,17 +105,16 @@ struct SelectedRowsAddTensor { auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -143,12 +144,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -173,9 +176,123 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +// This is a separate namespace for manipulating SelectedRows typed +// data, e.g. merging duplicated rows or adding two SelectedRows. +// +// Another group of functors is called "scatter updates", which means +// using SelectedRows to update a dense tensor with different Ops, like +// add or mul.
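As a concrete, made-up example of what the scatter::MergeAdd functor defined below computes:

// input:  rows = {4, 4, 7}, value = [[1, 2], [3, 4], [5, 6]]  (width 2)
// output: rows = {4, 7},    value = [[4, 6], [5, 6]]
// The two entries for row 4 are summed elementwise; the output row order
// follows the sorted std::set of unique row indices.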
+namespace scatter { + +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = FindPos(merge_rows, input_rows[i]); + for (int64_t j = 0; j < input_width; j++) { + out_data[out_i * input_width + j] += input_data[i * input_width + j]; + } + } + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + void operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + // FIXME(typhoonzero): use a macro to fix the messy code below.
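// (Editorial note, not part of the patch.) The cases below update the dense tensor input2 in place from the rows of input1: ASSIGN sets input2[row] = input1; ADD/SUB/MUL/DIV apply the op with input2[row] on the left; SUBBY and DIVBY reverse the operands, e.g. SUBBY computes input2[row] = input1 - input2[row].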
+ switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } + } +}; +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index 47fe3b44a50fee9f41ae807793187258159b9f29..0ee456f9bc61436bd0f2f8ef20dd1654e7e56d56 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
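A minimal usage sketch for the CPU scatter update defined above (illustrative only; ctx, grad, and param are assumptions standing for a live platform::CPUDeviceContext, a SelectedRows, and a dense Tensor of matching height and width):

// Hypothetical call site; `ctx`, `grad`, and `param` are assumptions.
paddle::operators::math::scatter::UpdateToTensor<
    paddle::platform::CPUDeviceContext, float>
    update;
update(ctx, paddle::operators::math::scatter::ScatterOps::ADD, grad, &param);
// effect: param[row] += the corresponding row of grad, via INLINE_FOR2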
*/ +#include + #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/selected_rows_functor.h" #include "paddle/platform/cuda_helper.h" @@ -20,8 +22,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -58,22 +60,21 @@ struct SelectedRowsAdd { PADDLE_ENFORCE(platform::is_gpu_place(out_place)); memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, in1_value.numel() * sizeof(T), reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy( - boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, - in2_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -96,8 +97,8 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -117,30 +118,28 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -163,16 +162,17 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy( - boost::get(in2_place), in2_data + input2_offset, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAddTo; 
-template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -195,8 +195,8 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -214,16 +214,167 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddToTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), in2_data, in1_row_numel); } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +namespace scatter { +template +__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, + T* out, const int64_t* out_rows, + size_t out_rows_size, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t out_idx; + + if (tid == 0) { + for (size_t i = 0; i < out_rows_size; i++) { + if (input_rows[ty] == out_rows[i]) { + out_idx = i; + } + } + } + + __syncthreads(); + + input += ty * row_numel; + out += out_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(out + index, input[index]); + } +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, input_rows.size()); + + MergeAddKernel< + T, 256><<(context) + .stream()>>>(input_data, input.rows().data(), out_data, + out.rows().data(), out.rows().size(), + input_width); + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +__global__ void UpdateToTensorKernel(const T* selected_rows, + const int64_t* rows, const ScatterOps& op, + T* tensor_out, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + // FIXME(typhoonzero): use a macro to fix the messy code below.
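// (Editorial note, not part of the patch.) Launch layout for this kernel: blockIdx.y selects one selected row (the grid is {1, nrows}), and each thread strides over row_numel in steps of block_size, so a block cooperatively processes a single row.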
+ switch (op) { + case ScatterOps::ASSIGN: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index]; + } + break; + case ScatterOps::ADD: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] += selected_rows[index]; + } + break; + case ScatterOps::SUB: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] -= selected_rows[index]; + } + break; + case ScatterOps::SUBBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] - tensor_out[index]; + } + break; + case ScatterOps::MUL: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] *= selected_rows[index]; + } + break; + case ScatterOps::DIV: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] /= selected_rows[index]; + } + break; + case ScatterOps::DIVBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] / tensor_out[index]; + } + break; + } +} + +template +struct UpdateToTensor { + void operator()(const platform::CUDADeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + // NOTE: Use SelectedRowsAddToTensor for better performance; + // no additional MergeAdd is called there. + MergeAdd merge_func; + auto merged_in1 = merge_func(context, input1); + + auto in1_height = merged_in1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = merged_in1.value(); + auto& in1_rows = merged_in1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.template data(); + auto* in2_data = input2->data(); + + dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); + dim3 grid(1, in1_rows.size()); + UpdateToTensorKernel<<< + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op, + in2_data, in1_row_numel); + } +}; +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index d6dc6c03c941f965394d952574d309c51eb82a62..09d4631905f90f78772368ad71b11826877bdc34 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -12,46 +12,123 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/eigen.h" #include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + namespace paddle { namespace operators { namespace math { // SelectedRows + SelectedRows will simply concat value and rows. // The real computation happens in dealing with LoDTensor.
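An illustration (made-up values) of the concatenation semantics described above:

// in1.rows = {0, 4}, in2.rows = {7}, same height and value width
// out.rows = {0, 4, 7}; out.value stacks in1.value on top of in2.value
// Duplicated rows are not merged here; that is scatter::MergeAdd's job.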
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAdd {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::SelectedRows& input2,
                   framework::SelectedRows* output);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAddTensor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::Tensor& input2, framework::Tensor* output);
 };
 
 // input2 = input1 + input2
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAddTo {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   const int64_t input2_offset, framework::SelectedRows* input2);
 };
 
 // input2 = input1 + input2
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAddToTensor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+namespace scatter {
+// functors for manipulating SelectedRows data
+template <typename DeviceContext, typename T>
+struct MergeAdd {
+  // unary functor, merge by adding duplicated rows in
+  // the input SelectedRows object.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input);
+};
+
+template <typename DeviceContext, typename T>
+struct Add {
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 + e_in2;
+    return out;
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct Mul {
+  // multiply two SelectedRows
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 * e_in2;
+    return out;
+  }
+  // multiply scalar to SelectedRows
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const T input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    e_out.device(*context.eigen_device()) = input2 * e_in1;
+    return out;
+  }
+};
+
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+
+// out = selected_rows_in / tensor
+template <typename DeviceContext, typename T>
+struct UpdateToTensor {
+  void operator()(const DeviceContext& context, const ScatterOps& op,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+}  // namespace scatter
 }  // namespace
math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc index a3649b6875aca61ee3ceb1ca83c7f9b38dc06c42..8c74cab0a1e817f9e98fa682fe4122db7837aec9 100644 --- a/paddle/operators/math/selected_rows_functor_test.cc +++ b/paddle/operators/math/selected_rows_functor_test.cc @@ -23,7 +23,7 @@ TEST(selected_rows_functor, cpu_add) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -47,7 +47,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -85,7 +85,7 @@ TEST(selected_rows_functor, cpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -112,7 +112,7 @@ TEST(selected_rows_functor, cpu_add_to) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -137,7 +137,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -173,7 +173,7 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 09de9dc53a1de9537b5109b3cc7cf9744f9c7908..38808e13014c581fb3d10c3e712f1dd1fa6523e2 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -21,10 +21,10 @@ TEST(selected_rows_functor, gpu_add) { using namespace paddle::platform; using namespace paddle::operators::math; - GPUPlace gpu_place(0); + CUDAPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,7 +48,7 @@ TEST(selected_rows_functor, gpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + Copy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -90,11 +90,11 @@ TEST(selected_rows_functor, gpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); - 
SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; - tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); + Copy(*tensor2, cpu_place, ctx, &tensor2_cpu); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); @@ -119,10 +119,10 @@ TEST(selected_rows_functor, gpu_add_to) { using namespace paddle::platform; using namespace paddle::operators::math; - GPUPlace gpu_place(0); + CUDAPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -147,7 +147,7 @@ TEST(selected_rows_functor, gpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + Copy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -187,11 +187,11 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; - tensor1_cpu.CopyFrom(*tensor1, cpu_place, ctx); + Copy(*tensor1, cpu_place, ctx, &tensor1_cpu); ctx.Wait(); auto* tensor1_cpu_data = tensor1_cpu.data(); diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index 5b3bde02fbf981772759caa3d0054fac4a8520f9..e459a42ca251a9fc79f745f48a118ce898a0f77e 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -13,15 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/math/sequence2batch.h" +#include "paddle/operators/math/math_function.h" namespace paddle { namespace operators { namespace math { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -48,13 +49,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index c5d968aeb216bbb3e0e17f138b9e891494d99f75..452ae8951000872b706f7e4227a62dbf98109e7e 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -59,20 +59,19 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); CopyMatrixRowsKernel<<>>( src_data, dst_data, index, height, width, is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index 73295ddbcb73fe80be08e732790f0ec75e94b415..a5c43a2c7d4d729c35a20a27de2a23141e6019bc 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -26,7 +26,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CopyMatrixRowsFunctor { public: // If is_src_index is true, @@ -34,12 +34,12 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. 
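  // A worked example (hypothetical values): with index = {2, 0, 1} and a
  // three-row src,
  //   is_src_index == true  (gather):  dst[0] = src[2], dst[1] = src[0],
  //                                    dst[2] = src[1];
  //   is_src_index == false (scatter): dst[2] = src[0], dst[0] = src[1],
  //                                    dst[1] = src[2].
  // LoDTensor2BatchFunctor uses the gather form to reorder time steps into
  // batches, and Batch2LoDTensorFunctor uses the scatter form to undo it.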
- void operator()(const platform::DeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index); + void operator()(const DeviceContext& context, const framework::Tensor& src, + const size_t* index, framework::Tensor& dst, + bool is_src_index); }; -template +template class LoDTensor2BatchFunctor { // Calculate the length of each sequence and // sort sequence index by the length. @@ -56,7 +56,7 @@ class LoDTensor2BatchFunctor { }; public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& lod_tensor, framework::LoDTensor& batch, bool is_cal_batch_lod, bool is_reverse = false) const { @@ -65,7 +65,7 @@ class LoDTensor2BatchFunctor { PADDLE_ENFORCE_GT(lods.size(), 2UL); PADDLE_ENFORCE_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1].data(), batch, true); return; } @@ -143,22 +143,22 @@ class LoDTensor2BatchFunctor { } batch.set_lod(batch_lods); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, seq2batch_idx, batch, true); } }; -template +template class Batch2LoDTensorFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& batch, framework::LoDTensor& lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT(in_lod.size(), 2UL); PADDLE_ENFORCE_EQ(in_lod[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_seq; + CopyMatrixRowsFunctor to_seq; size_t* index = in_lod[1].data(); to_seq(context, batch, index, lod_tensor, false); } diff --git a/paddle/operators/math/sequence_padding.cc b/paddle/operators/math/sequence_padding.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd66455eaef60209b9ca334480951a9f7687729b --- /dev/null +++ b/paddle/operators/math/sequence_padding.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + for (size_t i = 0; i < max_sequence_length; ++i) { + for (size_t j = 0; j < num_sequences; ++j) { + size_t start_pos = abs_offset_lod[level][j]; + size_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; + if (i < sequence_length) { + // i > 0 => sequence_length > 0 + T scale = + norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; + for (size_t k = 0; k < sequence_width; ++k) { + padding_data[(i * num_sequences + j) * sequence_width + k] = + seq_data[(start_pos + i) * sequence_width + k] * scale; + } + } else { + memset(padding_data + (i * num_sequences + j) * sequence_width, 0, + sequence_width * sizeof(T)); + } + } + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + const size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + for (size_t i = 0; i < num_sequences; ++i) { + size_t start_pos = abs_offset_lod[level][i]; + size_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; + for (size_t j = 0; j < sequence_length; ++j) { + // sequence_width > j > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (size_t k = 0; k < sequence_width; ++k) { + seq_data[(start_pos + j) * sequence_width + k] = + padding_data[(j * num_sequences + i) * sequence_width + k] * + scale; + } + } + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..e4be178f81581dea2e84cf488b01d5f7f4cc0030 --- /dev/null +++ b/paddle/operators/math/sequence_padding.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void SequencePaddingKernel(T* padding, T* sequence, + const size_t* sequence_start_positions, + const size_t sequence_width, + const size_t max_sequence_length, + const size_t num_sequences) { + size_t padding_idx = blockIdx.y; + size_t start_pos = sequence_start_positions[padding_idx]; + size_t sequence_length = + sequence_start_positions[padding_idx + 1] - start_pos; + + size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t padding_base_idx = + (sequence_idx * num_sequences + padding_idx) * sequence_width; + size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; + + if (sequence_idx < sequence_length) { + T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; + } + } else { + /* padding -> sequence */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; + } + } + } else if (sequence_idx < max_sequence_length) { + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = 0; + } + } + } +} + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(seq, context.GetPlace(), context, &padding); + padding.Resize(padding_dims); + return; + } + + const size_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
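+     * The rounding below is easy to misread: it rounds sequence_width / 8 up
+     * to the next multiple of 32 (one warp) and caps the result at
+     * kBlockSize. For example, sequence_width = 100 gives
+     * (100 + 7) >> 3 = 13 threads for >= 8 elements each, and
+     * ((13 + 31) >> 5) << 5 = 32, one full warp.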
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(padding, context.GetPlace(), context, &seq); + seq.Resize(seq_dims); + return; + } + + const size_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
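+     * Each thread block then covers block_dim_y consecutive time steps of a
+     * single sequence: grid_dim_x tiles max_sequence_length in steps of
+     * block_dim_y and grid_dim_y selects the sequence, the same launch
+     * layout as the padding kernel above.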
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding.h b/paddle/operators/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..8f586c5eb469ea260dbdf9cc4e7f9b4b4a46a8cc --- /dev/null +++ b/paddle/operators/math/sequence_padding.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +inline static size_t MaximumSequenceLength(const framework::LoD& lod, + const size_t level) { + const size_t num_sequences = lod[level].size() - 1; + size_t max_sequence_length = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + for (size_t i = 0; i < num_sequences; ++i) { + max_sequence_length = + std::max(max_sequence_length, + abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); + } + return max_sequence_length; +} + +/* + * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape + * [max_sequence_length, num_sequences, sequence_width]. + * + * Padding sequence: + * padding[i] = seq[lod[level][i]] + * Unpadding sequence: + * seq[lod[level][i]] = padding[i] + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + * + * \param context device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param padding Tensor which is padded to the same length, the shape is + * [max_sequence_length, num_sequences, sequence_width]. + * \param norm_by_times whether dividing sequence's length. + * + * \note transposition is also done in this functor. 
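+ *
+ * A concrete instance (illustrative numbers): lod = {{0, 4, 6, 9, 10}}
+ * gives num_sequences = 4, lengths {4, 2, 3, 1} and max_sequence_length = 4,
+ * so a seq of shape [10, sequence_width] pads to [4, 4, sequence_width];
+ * padding[i][j] holds time step i of sequence j, and is zero once i reaches
+ * that sequence's length, matching the s0..s3 example above.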
+ */ +template +class PaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, const framework::LoDTensor& seq, + framework::Tensor& padding, bool norm_by_times); +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const framework::Tensor& padding, bool norm_by_times); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding_test.cc b/paddle/operators/math/sequence_padding_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9799bcd65dc65d5741813374c68a2640eaf4556c --- /dev/null +++ b/paddle/operators/math/sequence_padding_test.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence_padding.h" +#include + +template +void TestSequencePadding(const paddle::framework::LoD& lod, + const size_t sequence_width) { + paddle::framework::LoDTensor cpu_seq; + paddle::framework::LoDTensor cpu_seq_back; + paddle::framework::LoDTensor seq; + paddle::framework::LoDTensor seq_back; + paddle::framework::Tensor padding; + + const size_t level = lod.size() - 1; + auto seq_dims = + paddle::framework::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); + + cpu_seq.set_lod(lod); + cpu_seq.mutable_data(seq_dims, paddle::platform::CPUPlace()); + for (size_t i = 0; i < cpu_seq.numel(); ++i) { + cpu_seq.data()[i] = static_cast(i); + } + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + seq = cpu_seq; + } else { + Copy(cpu_seq, *place, *context, &seq); + seq.set_lod(lod); + } + + const size_t max_sequence_length = + paddle::operators::math::MaximumSequenceLength(lod, level); + const size_t num_sequences = lod[level].size() - 1; + auto padding_dims = + paddle::framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + paddle::operators::math::PaddingLoDTensorFunctor()( + *context, seq, padding, false); + + seq_back.set_lod(lod); + seq_back.mutable_data(seq_dims, *place); + paddle::operators::math::UnpaddingLoDTensorFunctor()( + *context, seq_back, padding, false); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_seq_back = seq_back; + } else { + Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + cpu_seq_back.set_lod(lod); + } + + EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel()); + EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims()); + for (size_t i = 0; i < cpu_seq.numel(); ++i) { + EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); + } + + delete place; + delete context; +}; + +TEST(Seq2BatchPadding, CPU) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 
7, 10}); + TestSequencePadding(lod2, 128); +} + +#ifdef PADDLE_WITH_CUDA +TEST(SequencePadding, CUDA) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} +#endif diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc index 5913c99fdb01100d0de44ab317124550fa626528..8fb92b1a130b8f25163d856f3f596136072180cf 100644 --- a/paddle/operators/math/sequence_pooling.cc +++ b/paddle/operators/math/sequence_pooling.cc @@ -20,9 +20,9 @@ namespace operators { namespace math { template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -60,9 +60,9 @@ class MaxSeqPoolFunctor { }; template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -80,7 +80,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -93,10 +93,10 @@ class MaxSeqPoolGradFunctor { } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 5ed951402fecba66a8960f4d024bf3785dac51c7..4c9e6b375ce7251747b9cd443d86cca0858c84ef 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -46,9 +46,9 @@ __global__ void KeMaxSequencePool(const T* input, const size_t* starts, } template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -71,8 +71,7 @@ class MaxSeqPoolFunctor { dim3 threads(256, 1); dim3 grid(num_seq, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePool<<>>( in_data, starts.data(), out_data, max_index, num_seq, dim); } @@ -91,9 +90,9 @@ __global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, } template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -111,7 +110,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, 
static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -119,17 +118,16 @@ class MaxSeqPoolGradFunctor { unsigned int blocks = (num_seq * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePoolGrad<<>>( og_data, max_index, ig_data, num_seq, dim); } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h index 35dfe26de1a87a064410401244914d4e2a94176e..13ffb2ebef3a683b5e5fe64433a90237b944002e 100644 --- a/paddle/operators/math/sequence_pooling.h +++ b/paddle/operators/math/sequence_pooling.h @@ -23,18 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template +template class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index); }; -template +template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad); diff --git a/paddle/operators/math/sequence_scale.cc b/paddle/operators/math/sequence_scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e439e9a2cebaa5d494b185fd878e293a6895e45 --- /dev/null +++ b/paddle/operators/math/sequence_scale.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/sequence_scale.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + size_t seq_width = seq.dims()[1]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + T* seq_data = seq.mutable_data(context.GetPlace()); + for (size_t i = 0; i < num_seq; ++i) { + for (size_t j = lod[level][i] * seq_width; + j < lod[level][i + 1] * seq_width; ++j) { + seq_data[j] *= scales[i]; + } + } + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu new file mode 100644 index 0000000000000000000000000000000000000000..ceaabd8e0fd81c927fbd4333c0aa7954b8da8513 --- /dev/null +++ b/paddle/operators/math/sequence_scale.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence_scale.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales, + const size_t seq_width) { + for (int i = threadIdx.x; + i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width; + i += BlockSize) { + int idx = lod[blockIdx.x] * seq_width + i; + seq[idx] *= scales[blockIdx.x]; + } +} + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + const size_t seq_width = seq.numel() / seq.dims()[0]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + T* seq_data = seq.mutable_data(context.GetPlace()); + + SequenceScaleKernel<<< + num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( + seq_data, abs_offset_lod[level].data(), scales, seq_width); + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_scale.h b/paddle/operators/math/sequence_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd9a57c3f4d8d91bfb8933a0fd38355c227744d --- /dev/null +++ b/paddle/operators/math/sequence_scale.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Scale a sequence. + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * Given: + * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * scales = (2, 3, 4, 5) + * then: + * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) + + * + * \param context Device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param scales Array. The i-th sequence will be scaled by scales[i]. + * \param num_seq Number of sequence + * + */ +template +class ScaleLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const T* scales); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc index 3e2f15d6c27f58818128f32fab0bd4c5f36b0050..72f10f35f4ef39b41fbc5e900313eafd7ba669e9 100644 --- a/paddle/operators/math/softmax.cc +++ b/paddle/operators/math/softmax.cc @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu index 4dbab51d46bdaaa506a6c242d0958c73687f4eb9..9e73f6a371c950ed6f81ee90216f7fd3899f73ce 100644 --- a/paddle/operators/math/softmax.cu +++ b/paddle/operators/math/softmax.cu @@ -21,10 +21,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h index fe1074650234c5beb5889e7efd713164769ad740..471f44d340cfd0d6305a9127c34289ef1663accb 100644 --- a/paddle/operators/math/softmax.h +++ b/paddle/operators/math/softmax.h @@ -19,19 +19,18 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y); + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); }; -template +template class SoftmaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* y, const framework::Tensor* y_grad, - 
framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); }; } // namespace math diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h index 05793eeb3eeafaf36c301236197555b7b35e5803..82f597ff792decb1760f59e693026cd453432d05 100644 --- a/paddle/operators/math/softmax_impl.h +++ b/paddle/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -56,19 +56,18 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)) .unaryExpr(ValueClip()); - softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } -template -void SoftmaxGradFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* y, +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); @@ -89,8 +88,7 @@ void SoftmaxGradFunctor::operator()( .eval() .reshape(batch_by_one) .broadcast(one_by_class); - logits_grad.device(*context.GetEigenDevice()) = - (softmax_grad - dot) * softmax; + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } } // namespace math diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..ecd3a647e00655a57d11c2f082bd1f81822cf92b --- /dev/null +++ b/paddle/operators/math/unpooling.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/unpooling.h" +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..ecbde0f6a798ba817c28714b37af8187d2e9555e --- /dev/null +++ b/paddle/operators/math/unpooling.cu @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/unpooling.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. + */ +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); + } +}; +/* + * All tensors are in NCHW format. 
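+ * Index bookkeeping in the kernels above, for reference: a flattened input
+ * offset i decomposes as bidx = i / in_n_stride and
+ * cidx = (i % in_n_stride) / in_c_stride, where in_n_stride =
+ * input_height * input_width * channels and in_c_stride =
+ * input_height * input_width. indices_data[i] is the max element's offset
+ * within the matching output feature map of out_c_stride elements, which is
+ * exactly the bound PADDLE_ASSERT checks.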
+ */ +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h new file mode 100644 index 0000000000000000000000000000000000000000..0f0ff1371ebea8c7501aee1c7c45bc6a79de397e --- /dev/null +++ b/paddle/operators/math/unpooling.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f0ff1371ebea8c7501aee1c7c45bc6a79de397e
--- /dev/null
+++ b/paddle/operators/math/unpooling.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename DeviceContext, typename T>
+class Unpool2dMaxFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename DeviceContext, typename T>
+class Unpool2dMaxGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
index 99eb7fd46de42400a915d86706580d15b08a74a2..d574ed9234304d992a6e4a10fce0816aee7fa40a 100644
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
@@ -25,9 +25,9 @@ namespace math {
  *                      output_depth, output_height, output_width]
  */
 template <class T>
-class Vol2ColFunctor<platform::CPUPlace, T> {
+class Vol2ColFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& vol,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -111,9 +111,9 @@ class Vol2ColFunctor<platform::CPUPlace, T> {
  *                      output_depth, output_height, output_width]
  */
 template <class T>
-class Col2VolFunctor<platform::CPUPlace, T> {
+class Col2VolFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -190,10 +190,10 @@ class Col2VolFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class Vol2ColFunctor<platform::CPUPlace, float>;
-template class Vol2ColFunctor<platform::CPUPlace, double>;
-template class Col2VolFunctor<platform::CPUPlace, float>;
-template class Col2VolFunctor<platform::CPUPlace, double>;
+template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
+template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
+template class Col2VolFunctor<platform::CPUDeviceContext, float>;
+template class Col2VolFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu
index dae3be858e9f47d0133aa37e8a5f90a0addf1dfd..b029442fe48dd27232d322aadec5864760e1b9ff 100644
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
@@ -68,9 +68,9 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
  *                      output_depth, output_height, output_width]
  */
 template <class T>
-class Vol2ColFunctor<platform::GPUPlace, T> {
+class Vol2ColFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& vol,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -117,9 +117,7 @@ class Vol2ColFunctor<platform::GPUPlace, T> {
     const int threads = 1024;
     const int blocks = (num_outputs + 1024 - 1) / 1024;
 
-    vol2col<T><<<blocks, threads, 0,
-                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                     .stream()>>>(
+    vol2col<T><<<blocks, threads, 0, context.stream()>>>(
         num_outputs, vol.data<T>(), input_depth, input_height, input_width,
         dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
         filter_width, strides[0], strides[1], strides[2], paddings[0],
@@ -196,9 +194,9 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth,
  *                      output_depth, output_height, output_width]
  */
 template <class T>
-class Col2VolFunctor<platform::GPUPlace, T> {
+class Col2VolFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -245,9 +243,7 @@ class Col2VolFunctor<platform::GPUPlace, T> {
     const int threads = 1024;
     const int blocks = (num_kernels + 1024 - 1) / 1024;
-    col2vol<T><<<blocks, threads, 0,
-                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                     .stream()>>>(
+    col2vol<T><<<blocks, threads, 0, context.stream()>>>(
         num_kernels, col.data<T>(), input_depth, input_height, input_width,
         dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
         filter_width, strides[0], strides[1], strides[2], paddings[0],
@@ -256,10 +252,10 @@ class Col2VolFunctor<platform::GPUPlace, T> {
   }
 };
 
-template class Vol2ColFunctor<platform::GPUPlace, float>;
-template class Vol2ColFunctor<platform::GPUPlace, double>;
-template class Col2VolFunctor<platform::GPUPlace, float>;
-template class Col2VolFunctor<platform::GPUPlace, double>;
+template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
+template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
+template class Col2VolFunctor<platform::CUDADeviceContext, float>;
+template class Col2VolFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
index cbc30bd754608dd6e6def1a4097d69bdf0c942c3..dcd80370e8516d34b764b1ab3b0b98516e738bf6 100644
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
@@ -62,22 +63,20 @@ namespace math {
  * \note The caller needs to ensure that volShape.inputChannels is equal to
  *       colShape.inputChannels.
  */
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Vol2ColFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& vol,
+  void operator()(const DeviceContext& context, const framework::Tensor& vol,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
                   framework::Tensor* col) const;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Col2VolFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& col,
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
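Every launch site in this patch sizes its grid with the same ceiling-division idiom, `(n + threads - 1) / threads`. A one-function restatement of why that covers all elements (illustrative, not from the patch):

    // Smallest block count such that blocks * threads >= n.
    inline int GridFor(int n, int threads = 1024) {
      return (n + threads - 1) / threads;  // e.g. GridFor(1500) == 2
    }
    // Threads whose global index is >= n do no work: they either fail the
    // kernel's bounds check or fall straight out of its grid-stride loop.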
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index c31c716842f30de67c29b803866b8c82ddcf4a41..7a308ca81403fb4ac65a7463102a91c050ab4561 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <iostream>
 
-template <typename Place>
+template <typename DeviceContext, typename Place>
 void testVol2col() {
   paddle::framework::Tensor input;
   paddle::framework::Tensor input_tmp;
@@ -24,18 +24,7 @@ void testVol2col() {
   paddle::framework::Tensor output_tmp;
 
   auto* place = new Place();
-  paddle::platform::DeviceContext* context;
-  if (paddle::platform::is_cpu_place(*place)) {
-    context =
-        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    context =
-        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
-#else
-    PADDLE_THROW("no GPU support");
-#endif  // PADDLE_WITH_CUDA
-  }
+  DeviceContext* context = new DeviceContext(*place);
 
   /**
    * input = [[0, 1, 2,
@@ -82,13 +71,13 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    Copy(input_tmp, *place, *context, &input);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
                              *place);
 
-  paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
+  paddle::operators::math::Vol2ColFunctor<DeviceContext, float> vol2col;
   vol2col(*context, input, dilations, strides, paddings, &output);
 
   float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
@@ -96,7 +85,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
+    Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -110,17 +99,17 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    Copy(input_tmp, *place, *context, &input);
   }
 
-  paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
+  paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
   col2vol(*context, output, dilations, strides, paddings, &input);
 
   float* in_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
 
@@ -130,8 +119,9 @@ void testVol2col() {
 }
 
 TEST(math, vol2col) {
-  testVol2col<paddle::platform::CPUPlace>();
+  testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
-  testVol2col<paddle::platform::GPUPlace>();
+  testVol2col<paddle::platform::CUDADeviceContext,
+              paddle::platform::CUDAPlace>();
 #endif  // PADDLE_WITH_CUDA
 }
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
index 5a1a6154203d40186f1e41491194b19612931b1f..fd65d894d5749c97f860d614de354e89f6d9441d 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -130,7 +130,7 @@ class MatMulOp : public framework::OperatorWithKernel {
 
 class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of MatMul op");
     AddInput("Y", "The second input of MatMul op");
@@ -206,7 +206,8 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
             ops::MatMulOpGrad);
-REGISTER_OP_CPU_KERNEL(matmul,
-                       ops::MatMulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    matmul_grad, ops::MatMulGradKernel<paddle::platform::CPUPlace, float>);
+    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>);
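The matmul registration above is one instance of a pattern this patch repeats for every operator: the kernel is parameterized on a device-context type instead of a Place, and GPU registrations move from REGISTER_OP_GPU_KERNEL to REGISTER_OP_CUDA_KERNEL. A minimal sketch of the shape of that change (`SomeKernel`/`some_op` are placeholders, not real operators in the tree):

    // Kernel templated on the concrete context type.
    template <typename DeviceContext, typename T>
    class SomeKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& ctx) const override {
        // Recover the concrete CPUDeviceContext / CUDADeviceContext here.
        auto& dev_ctx = ctx.template device_context<DeviceContext>();
        (void)dev_ctx;  // ... use dev_ctx with math functors, streams, etc.
      }
    };
    // One class, bound per backend:
    // REGISTER_OP_CPU_KERNEL(some_op,
    //     ops::SomeKernel<paddle::platform::CPUDeviceContext, float>);
    // REGISTER_OP_CUDA_KERNEL(some_op,
    //     ops::SomeKernel<paddle::platform::CUDADeviceContext, float>);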
diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc
index b7e66382f00445b087e14103e7a148d450b37405..d28d12164e493786c5bdafb157795d797ee87b91 100644
--- a/paddle/operators/matmul_op.cu.cc
+++ b/paddle/operators/matmul_op.cu.cc
@@ -1,21 +1,22 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/matmul_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(matmul,
-                       ops::MatMulKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    matmul_grad, ops::MatMulGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index 1e4aa48b7018d8e3d6f02591fbca2877ddbd3c5d..78adc64f76f45afce64c49bcf734647e0db2d6b3 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -1,16 +1,16 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -27,7 +27,7 @@ using DDim = framework::DDim;
 using framework::make_ddim;
 using framework::vectorize;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MatMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -38,8 +38,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     bool transpose_x = context.Attr<bool>("transpose_X");
     bool transpose_y = context.Attr<bool>("transpose_Y");
 
-    math::MatMulFunctor<Place, T>()(context.device_context(), x, transpose_x, y,
-                                    transpose_y, T(1), out, T(0));
+    math::MatMulFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), x, transpose_x, y,
+        transpose_y, T(1), out, T(0));
   }
 };
 
@@ -68,17 +69,16 @@ Tensor CombineBatchAndM(const Tensor& input) {
 // Reshape a rank-3 tensor from P x M x N to M x (P * N).
 // (Warning: This requires transposing data and writes into new memory.)
 // Identity op if the tensor is not of rank 3.
-template <typename Place, typename T>
-Tensor CombineBatchAndN(const framework::ExecutionContext& context,
-                        const Tensor& input) {
+template <typename DeviceContext, typename T>
+Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) {
   Tensor output;
   auto in_dims = input.dims();
   if (in_dims.size() == 3) {
     output.Resize({in_dims[1], in_dims[0], in_dims[2]});
     output.mutable_data<T>(context.GetPlace());
     std::vector<int> axis = {1, 0, 2};
-    math::Transpose<Place, T, 3> trans;
-    trans(context.device_context(), input, &output, axis);
+    math::Transpose<DeviceContext, T, 3> trans;
+    trans(context, input, &output, axis);
     std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
     output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
   } else {
@@ -112,7 +112,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
 //
 // To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
 // to X: (P * M) x K, dOut: (P * M) x N.
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MatMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -178,24 +178,23 @@ class MatMulGradKernel : public framework::OpKernel<T> {
     Tensor Y = Reshape<T>(y, make_ddim(y_dims));
     Tensor dOut = Reshape<T>(dout, make_ddim(dout_dims));
 
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (dx) {
       dx->mutable_data<T>(context.GetPlace());
       const Tensor& dOut_for_dX =
           (x_dims.size() == 2 && y_dims.size() == 3)
-              ? CombineBatchAndN<Place, T>(context, dOut)
+              ? CombineBatchAndN<DeviceContext, T>(dev_ctx, dOut)
               : dOut;
       if (x_dims.size() == 2 && y_dims.size() == 3) {
         Y = transpose_y ? CombineBatchAndM<T>(Y)
-                        : CombineBatchAndN<Place, T>(context, Y);
+                        : CombineBatchAndN<DeviceContext, T>(dev_ctx, Y);
       }
       if (transpose_x) {
-        math::MatMulFunctor<Place, T>()(context.device_context(), Y,
-                                        transpose_y, dOut_for_dX, transpose_x,
-                                        T(1), dx, T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0));
       } else {
-        math::MatMulFunctor<Place, T>()(context.device_context(), dOut_for_dX,
-                                        transpose_x, Y, !transpose_y, T(1), dx,
-                                        T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0));
       }
     }
 
@@ -205,18 +204,16 @@ class MatMulGradKernel : public framework::OpKernel<T> {
               ? CombineBatchAndM<T>(dOut)
               : dOut;
       if (y_dims.size() == 2 && x_dims.size() == 3) {
-        X = transpose_x ? CombineBatchAndN<Place, T>(context, X)
+        X = transpose_x ? CombineBatchAndN<DeviceContext, T>(dev_ctx, X)
                         : CombineBatchAndM<T>(X);
         dOut = CombineBatchAndM<T>(dOut);
       }
       if (transpose_y) {
-        math::MatMulFunctor<Place, T>()(context.device_context(), dOut_for_dY,
-                                        transpose_y, X, transpose_x, T(1), dy,
-                                        T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0));
       } else {
-        math::MatMulFunctor<Place, T>()(context.device_context(), X,
-                                        !transpose_x, dOut_for_dY, transpose_y,
-                                        T(1), dy, T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0));
      }
    }
  }
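The CombineBatchAndN helper used by the gradient kernel above is easiest to see on a concrete tensor. A worked example (illustrative only, with P = 2 batches, M = 2 rows, N = 3 columns):

    // Input, P x M x N:
    //   batch 0 = [[a b c], [d e f]]     batch 1 = [[g h i], [j k l]]
    // Transpose with axes {1, 0, 2}  ->  M x P x N:
    //   row 0 = [[a b c], [g h i]]       row 1 = [[d e f], [j k l]]
    // Flatten the trailing two dims  ->  M x (P * N) = 2 x 6:
    //   [[a b c g h i],
    //    [d e f j k l]]

Because the transpose reorders memory, the comment's warning holds: unlike CombineBatchAndM, this is not a free reshape.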
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..019150e4914e8bd34a5e8b7d37318aee43942fcc
--- /dev/null
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+class MaxSeqenceLenOp : public framework::OperatorBase {
+ public:
+  MaxSeqenceLenOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    int64_t *out_ptr = out->mutable_data<int64_t>({1}, platform::CPUPlace());
+    *out_ptr = rank_table.items()[0].length;
+  }
+};
+
+class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RankTable", "The lod_rank_table.");
+    AddOutput("Out", "The max sequence length.");
+    AddComment(
+        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+  }
+};
+
+class MaxSeqenceLenInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", {1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp,
+                  paddle::operators::MaxSeqenceLenOpProtoMaker,
+                  paddle::operators::MaxSeqenceLenInferShape,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
index 95467f2e69093906980d075b6a41c5d2934dd5a2..3ee32269417e80cd14a6ff0f8e52c0b2dec4b8be 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -20,18 +20,19 @@ using framework::Tensor;
 
 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
+    AddInput(
+        "X",
         "(Tensor) The input tensor of maxout operator. "
         "The format of input tensor is NCHW. Where N is batch size, C is the "
         "number of channels, H and W is the height and width of feature.");
     AddOutput("Out",
-        "(Tensor) The output tensor of maxout operator."
-        "The format of output tensor is also NCHW."
-        "Where N is batch size, C is "
-        "the number of channels, H and W is the height and "
-        "width of feature.");
+              "(Tensor) The output tensor of maxout operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
     AddAttr<int>(
         "groups",
         R"DOC("Specifies how many groups the input tensor will be split"
@@ -39,41 +40,44 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
         "the number of channels divided by groups.."
         )DOC");
     AddComment(R"DOC(
-    Assumed the input shape is (N, Ci, H, W).
-    The output shape is (N, Co, H, W). Then `Co = Ci / groups`.
+MaxOut Operator.
 
-    math:
-    y_{si+j} = \max_k x_{gsi + sk + j}
-    g = groups
-    s = input.size / num_channels
-    0 \le i < num_channels / groups
-    0 \le j < s
-    0 \le k < groups
+Assume the input shape is (N, Ci, H, W).
+The output shape is (N, Co, H, W).
+Then $Co = Ci / groups$ and the operator formula is as follows:
 
-    Please refer to Paper:
-      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-      - Multi-digit Number Recognition from Street View \
-        Imagery using Deep Convolutional Neural Networks: \
-        https://arxiv.org/pdf/1312.6082v4.pdf
-    )DOC");
+$$
+y_{si+j} = \max_k x_{gsi + sk + j} \\
+g = groups \\
+s = \frac{input.size}{num\_channels} \\
+0 \le i < \frac{num\_channels}{groups} \\
+0 \le j < s \\
+0 \le k < groups
+$$
+
+Please refer to Paper:
+  - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+  - Multi-digit Number Recognition from Street View \
+    Imagery using Deep Convolutional Neural Networks: \
+    https://arxiv.org/pdf/1312.6082v4.pdf
+
+)DOC");
   }
 };
 
-
 class MaxOutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MaxoutOp"
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp"
                    "should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of MaxoutOp should not be null.");
     auto in_x_dims = ctx->GetInputDim("X");
     int groups = ctx->Attrs().Get<int>("groups");
     // check groups > 1
-    PADDLE_ENFORCE_GT(
-        groups, 1,
-        "groups should be larger than 1 in maxoutop");
+    PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
     output_shape.push_back(in_x_dims[2]);
     output_shape.push_back(in_x_dims[3]);
@@ -87,18 +91,18 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-        "Input(X@GRAD) should not be null.");
+                   "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
-}    // namespace operators
-}    // namespace paddle
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-    ops::MaxOutOpGrad);
-REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(maxout_grad,
-    ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad,
+    ops::MaxOutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
index a5823fba6848a0d42a743c90d7d683e3e4ae4422..c4a2d676d3aca4d59d0bfa8c75aa0c249e202ab5 100644
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
@@ -1,25 +1,24 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/maxout_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(maxout,
-                       ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
-                       ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaxOutKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    maxout_grad,
+    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, double>);
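A reference implementation may help pin down the maxout formula quoted in maxout_op.cc, $y_{si+j} = \max_k x_{gsi + sk + j}$. This CPU sketch is illustrative only, assumes a single input sample, and is not part of the patch:

    #include <algorithm>
    #include <limits>
    #include <vector>

    // Maxout over channel groups: `channels` input channels of `plane`
    // (= H * W) elements each; every `groups` consecutive channels collapse
    // into one output channel by an elementwise max.
    std::vector<float> MaxOutRef(const std::vector<float>& x, int channels,
                                 int groups, int plane) {
      const int out_channels = channels / groups;
      std::vector<float> y(out_channels * plane,
                           -std::numeric_limits<float>::infinity());
      for (int i = 0; i < out_channels; ++i)        // output channel
        for (int k = 0; k < groups; ++k)            // member of the group
          for (int j = 0; j < plane; ++j)           // spatial position
            y[i * plane + j] = std::max(y[i * plane + j],
                                        x[(i * groups + k) * plane + j]);
      return y;
    }

With Ci = 4 and groups = 2 this yields Co = 2, matching the InferShape rule `in_x_dims[1] / groups` above.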
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
index c404cd16a9b2372ea4c6a17eb5ac82cf8f3bf27c..e8b12552b9ff39e23702de17abc9825a527f02aa 100644
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
@@ -23,7 +23,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MaxOutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -31,12 +31,13 @@ class MaxOutKernel : public framework::OpKernel<T> {
     Tensor* out = context.Output<Tensor>("Out");
     int groups = context.template Attr<int>("groups");
 
-    math::MaxOutFunctor<Place, T> maxout_forward;
-    maxout_forward(context.device_context(), *in_x, out, groups);
+    math::MaxOutFunctor<DeviceContext, T> maxout_forward;
+    maxout_forward(context.template device_context<DeviceContext>(), *in_x, out,
+                   groups);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MaxOutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -46,14 +47,13 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
     int groups = context.template Attr<int>("groups");
-    auto& device_ctx = context.device_context();
-    math::SetConstant<Place, T> zero;
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
     if (in_x_grad) {
       in_x_grad->mutable_data<T>(context.GetPlace());
       zero(device_ctx, in_x_grad, static_cast<T>(0.0));
-      math::MaxOutGradFunctor<Place, T> maxout_backward;
-      maxout_backward(context.device_context(), *in_x, in_x_grad, *out,
-                      *out_grad, groups);
+      math::MaxOutGradFunctor<DeviceContext, T> maxout_backward;
+      maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups);
     }
   }
 };
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index dcc5b4286f4ac833268a779a9a7edd2ed119ffff..411f4d14efbfa5a8ee6dd7da645a044b191bf006 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -32,7 +32,7 @@ class MeanOp : public framework::OperatorWithKernel {
 
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op");
@@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
     grad_op->SetType("mean_grad");
     grad_op->SetInput("X", Input("X"));
     grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
 
@@ -76,8 +76,9 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
-REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<paddle::platform::CPUPlace, float>,
-                       ops::MeanKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(mean_grad,
-                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::MeanGradKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
index ca089938c048f7aa5bd561f57c093aa74cce4e11..212d4481138c1478f6e3aa684008f9e42c5a3870 100644
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -1,24 +1,25 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/mean_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<paddle::platform::GPUPlace, float>,
-                       ops::MeanKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(mean_grad,
-                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::MeanGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index c99286a5b928f1edcd845b01b21b95654c25db07..351b34595974b1771d9f4ae5232e0b3a33491104 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MeanKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -38,13 +38,14 @@ class MeanKernel : public framework::OpKernel<T> {
 
     auto X = EigenVector<T>::Flatten(*input);
     auto y = EigenScalar<T>::From(*output);
-    auto& place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     y.device(place) = X.mean();
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MeanGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -56,7 +57,8 @@ class MeanGradKernel : public framework::OpKernel<T> {
     T ig_size = static_cast<T>(IG->numel());
     Eigen::DSizes<int, 1> bcast(ig_size);
 
-    EigenVector<T>::Flatten(*IG).device(context.GetEigenDevice<Place>()) =
+    EigenVector<T>::Flatten(*IG).device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
         (EigenVector<T>::From(*OG) / ig_size).broadcast(bcast);
   }
 };
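The broadcast in MeanGradKernel above encodes a one-line derivation: since Out = (1/n) * sum_i x_i, we have dOut/dx_i = 1/n for every element, so the whole input gradient is the scalar dOut / n repeated n times. In plain C++ (reference only, not part of the patch):

    // Backward of mean-reduction: every input element receives d_out / n.
    void MeanGradRef(float d_out, int n, float* d_x) {
      for (int i = 0; i < n; ++i) d_x[i] = d_out / n;
    }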
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index 80460c476921b63ec5228a9780880c7db3c85217..87644d316d42c7d9453a99b759214b24088062df 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                     const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
@@ -45,7 +49,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
 #ifdef PADDLE_WITH_CUDA
-      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+      framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
       PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
 #endif
@@ -99,8 +103,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       if (len == 0) {
         continue;
       }
-      out->Slice(out_offset, out_offset + len)
-          .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
+      auto slice = out->Slice(out_offset, out_offset + len);
+      framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx,
+                      &slice);
       out_offset += len;
       (*in_idx) += 1;
     }
@@ -113,8 +118,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
 
 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input LoDTensor, contains complete lod information to "
@@ -161,15 +165,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
     grad_op->SetType("split_lod_tensor");
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetInput("Mask", Input("Mask"));
     grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
     grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
     grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
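The Run() signature change in MergeLoDTensorOp is another cross-cutting theme of this series: operators now receive a Place and look the device context up themselves. A minimal sketch of that call pattern, assuming only the DeviceContextPool API used in the hunk above (names otherwise illustrative):

    void Run(const framework::Scope &scope,
             const platform::Place &dev_place) /* const override */ {
      // The pool owns one context per device; Get() returns a borrowed pointer.
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto &dev_ctx = *pool.Get(dev_place);
      (void)scope;
      (void)dev_ctx;  // ... hand dev_ctx to Copy(), math functors, etc.
    }

The upside is that an operator no longer needs a context threaded through every caller; the pool is the single owner and the Place is enough to recover it.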
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 4684c20208501a3239fd57b35428946bb52af4a0..3d7742dd4bc2a3c727279bc1e6c7dd47b96eefa3 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/minus_op.h"
 #include "paddle/operators/net_op.h"
@@ -46,7 +46,7 @@ class MinusOp : public framework::OperatorWithKernel {
 
 class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left tensor of minus operator.");
     AddInput("Y", "The right tensor of minus operator.");
@@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
  public:
   using framework::GradOpDescMakerBase::GradOpDescMakerBase;
 
-  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
-      const override {
-    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<framework::OpDesc>> ops;
     auto x_g = InputGrad("X");
     if (!x_g.empty()) {
-      auto *x_g_op = new framework::OpDescBind();
+      auto *x_g_op = new framework::OpDesc();
       x_g_op->SetType("scale");
       x_g_op->SetInput("X", OutputGrad("Out"));
       x_g_op->SetOutput("Out", x_g);
@@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
 
     auto y_g = InputGrad("Y");
     if (!y_g.empty()) {
-      auto *y_g_op = new framework::OpDescBind();
+      auto *y_g_op = new framework::OpDesc();
       y_g_op->SetType("scale");
       y_g_op->SetInput("X", OutputGrad("Out"));
       y_g_op->SetOutput("Out", y_g);
@@ -102,5 +101,5 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
-REGISTER_OP_CPU_KERNEL(minus,
-                       ops::MinusKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    minus, ops::MinusKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu
index a8375cc6301b2c1a917299c3933b03226bb72907..80cd9f7c16845904b7b46ae1597ce9558c32f46a 100644
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
@@ -1,18 +1,19 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/minus_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    minus, paddle::operators::MinusKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    minus,
+    paddle::operators::MinusKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
index bd9a2790aa2b208c2d3dfc792031283eb6c42397..20760b8cd5bd2f74ed8469addda8f67f11f4545c 100644
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MinusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -28,7 +28,8 @@ class MinusKernel : public framework::OpKernel<T> {
     auto* out_tensor = context.Output<framework::Tensor>("Out");
 
     out_tensor->mutable_data<T>(context.GetPlace());
-    auto& dev = context.GetEigenDevice<Place>();
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
     framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
         framework::EigenVector<T>::Flatten(*left_tensor) -
         framework::EigenVector<T>::Flatten(*right_tensor);
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 28528848af1f467bf38be53f9d05fee6ca3f93cc..f5d69071a86e3f8037840c091cf5b7683e4eeb96 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/modified_huber_loss_op.h"
 
@@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
 
 class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ModifiedHuberLossOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input tensor of modified huber loss op. "
@@ -115,6 +114,6 @@ REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
 
 REGISTER_OP_CPU_KERNEL(
     modified_huber_loss,
-    ops::ModifiedHuberLossKernel<paddle::platform::CPUPlace, float>);
+    ops::ModifiedHuberLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad,
                        ops::ModifiedHuberLossGradCPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
index 8854e166cd99ce914d7f9f9bcead3234b0649506..3d2a5562e8cc2117b0b460496d9ba8e96823fbfb 100644
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include <thrust/for_each.h>
-#include <thrust/tuple.h>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
 #include "paddle/framework/op_registry.h"
@@ -71,8 +71,8 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     modified_huber_loss,
-    ops::ModifiedHuberLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad,
-                       ops::ModifiedHuberLossGradGPUKernel<float>);
+    ops::ModifiedHuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad,
+                        ops::ModifiedHuberLossGradGPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
index aba75efad9c19e3e113b4f09bc1fbd4732f4e187..6ce86feee574efca8811f316f47f1c3fbbdd0bf9 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -46,7 +46,7 @@ struct ModifiedHuberLossForward {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ModifiedHuberLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -57,7 +57,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel<T> {
     out0->mutable_data<T>(context.GetPlace());
     out1->mutable_data<T>(context.GetPlace());
 
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto x = EigenVector<T>::Flatten(*in0);
     auto y = EigenVector<T>::Flatten(*in1);
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
index 19954006195c1e9fd34328b52ed2a9eade526235..15b8b80776732f43c3ef4f8b80cffedf5c2a76fd 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -54,8 +54,7 @@ class MomentumOp : public framework::OperatorWithKernel {
 
 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MomentumOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
@@ -71,8 +70,12 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor, default Tensor<float>) "
              "Input learning rate");
 
-    AddOutput("ParamOut", "(Tensor) Output updated parameter");
-    AddOutput("VelocityOut", "(Tensor) Output updated velocity");
+    AddOutput("ParamOut",
+              "(Tensor) This output is the updated parameter. "
+              "It shares memory with Input(Param).");
+    AddOutput("VelocityOut",
+              "(Tensor) This output is the updated velocity. "
+              "It shares memory with Input(Velocity).");
 
     AddAttr<float>("mu", "(float) Momentum coefficient");
     AddAttr<bool>("use_nesterov",
@@ -101,5 +104,5 @@ $$
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    momentum, ops::MomentumOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>,
+                       ops::MomentumOpKernel<double>);
diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu
index efc24e795e05951024009f0b3258769c352df344..2b9314162e6f10b3791c913203c732d2822861ab 100644
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
@@ -1,20 +1,78 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/operators/momentum_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MomentumKernel(const T* p, const T* g, const T* v,
+                               const T* learning_rate, const T mu,
+                               const int64_t num, bool use_nesterov, T* p_out,
+                               T* v_out) {
+  T lr = learning_rate[0];
+  if (use_nesterov) {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+         i += blockDim.x * gridDim.x) {
+      T g_val = g[i];
+      T v_new = v[i] * mu + g_val;
+      v_out[i] = v_new;
+      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+    }
+  } else {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+         i += blockDim.x * gridDim.x) {
+      T v_new = v[i] * mu + g[i];
+      v_out[i] = v_new;
+      p_out[i] = p[i] - lr * v_new;
+    }
+  }
+}
+
+template <typename T>
+class MomentumOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto velocity = ctx.Input<framework::Tensor>("Velocity");
+    auto grad = ctx.Input<framework::Tensor>("Grad");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+
+    auto* p = param->data<T>();
+    auto* v = velocity->data<T>();
+    auto* g = grad->data<T>();
+    auto* lr = learning_rate->data<T>();
+
+    int block = 512;
+    int grid = (param->numel() + block - 1) / block;
+    MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+        p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    momentum, ops::MomentumOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
+                        ops::MomentumOpCUDAKernel<double>);
diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h
index 8f7f5eb5c21c0342f57a47b85d28f4454f4566c2..da69532ea58bad8d3908770d82dbcc6e6b108fce 100644
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename T>
 class MomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -33,7 +33,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     param_out->mutable_data<T>(ctx.GetPlace());
     velocity_out->mutable_data<T>(ctx.GetPlace());
 
-    float mu = ctx.Attr<float>("mu");
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
     bool use_nesterov = ctx.Attr<bool>("use_nesterov");
 
     auto p_out = framework::EigenVector<T>::Flatten(*param_out);
@@ -42,18 +42,13 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     auto p = framework::EigenVector<T>::Flatten(*param);
     auto v = framework::EigenVector<T>::Flatten(*velocity);
     auto g = framework::EigenVector<T>::Flatten(*grad);
-    auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
+    auto* lr = learning_rate->data<T>();
 
-    auto place = ctx.GetEigenDevice<Place>();
-
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-
-    v_out.device(place) = v * mu + g;
+    v_out = v * mu + g;
     if (use_nesterov) {
-      p_out.device(place) = p - g * lr.broadcast(grad_dsize) +
-                            v_out * mu * lr.broadcast(grad_dsize);
+      p_out = p - (g - v_out * mu) * lr[0];
     } else {
-      p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out;
+      p_out = p - lr[0] * v_out;
     }
   }
 };
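Written out in scalar form, the CPU kernel above and the CUDA kernel in momentum_op.cu implement the same update. A reference version, illustrative only:

    // Per-element momentum step mirroring both kernels:
    //   v_new = mu * v + g
    //   plain:    p_new = p - lr * v_new
    //   nesterov: p_new = p - lr * (g - mu * v_new)
    void MomentumStepRef(float mu, float lr, bool nesterov, float g, float* p,
                         float* v) {
      float v_new = *v * mu + g;
      *v = v_new;
      *p -= nesterov ? lr * (g - v_new * mu) : lr * v_new;
    }

Note the learning rate is now read as a single scalar (`lr[0]`) instead of being broadcast into a full-size Eigen expression, which removes the `grad_dsize` broadcast from the old code.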
)DOC") .SetDefault(1) .EqualGreaterThan(1); AddAttr( "y_num_col_dims", - "(int, default 1) " - R"DOC(mul_op can take tensors with more than two dimensions as input `Y`, - in that case, tensors will be reshaped to a matrix. Just like input `X`. + R"DOC((int, default 1), The mul_op can take tensors with more than two, + dimensions as its inputs. If the input $Y$ is a tensor with more + than two dimensions, $Y$ will be flattened into a two-dimensional + matrix first. The attribute `y_num_col_dims` determines how $Y$ is + flattened. See comments of `x_num_col_dims` for more details. )DOC") .SetDefault(1) .EqualGreaterThan(1); AddComment(R"DOC( -Mul Operator. +Mul Operator. -This operator is used to perform matrix multiplication for input X and Y. +This operator is used to perform matrix multiplication for input $X$ and $Y$. The equation is: - $$Out = X * Y$$ +$$Out = X * Y$$ -Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input `X`. +Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input $X$. )DOC"); } @@ -149,6 +160,7 @@ REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, ops::MulOpShapeInference, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc index 66dc3d6d106a18640adad413d4e967fa101abcfc..43de9a719499e4e0e8fd2e5fcc6771d717ce6522 100644 --- a/paddle/operators/mul_op.cu.cc +++ b/paddle/operators/mul_op.cu.cc @@ -1,20 +1,21 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/mul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 0eb9df41e9415845f88af283de63856158b447f9..1fb0569b49cce80c3f1e408fb57b5f5cf7033a27 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,15 +46,16 @@ class MulKernel : public framework::OpKernel { if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); } - math::matmul(context.device_context(), x_matrix, false, y_matrix, - false, 1, z, 0); + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); if (z_dim.size() != 2) { z->Resize(z_dim); } } }; -template +template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -77,6 +78,7 @@ class MulGradKernel : public framework::OpKernel { Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); if (dx) { dx->mutable_data(ctx.GetPlace()); Tensor dx_matrix = dx->dims().size() > 2 @@ -84,8 +86,8 @@ class MulGradKernel : public framework::OpKernel { : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(ctx.device_context(), dout_mat, false, y_matrix, - true, 1, &dx_matrix, 0); + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); @@ -93,8 +95,8 @@ class MulGradKernel : public framework::OpKernel { ? framework::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - math::matmul(ctx.device_context(), x_matrix, true, dout_mat, - false, 1, &dy_matrix, 0); + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); } } }; diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index f8527dfab3f3c42f430c433a11351f12b8dfae8b..78263da2fbf843f6a5af2ba95aa0b219a7523b52 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/multiplex_op.h" @@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), @@ -61,8 +61,7 @@ class MultiplexOp : public framework::OperatorWithKernel { class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { public: - MultiplexOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Ids", "The index tensor of multiplex operator."); AddInput("X", "The candidate tensors of multiplex operator.") @@ -99,17 +98,11 @@ class MultiplexGradOp : public framework::OperatorWithKernel { "Output(X@Grad) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - std::vector d_ins; - auto ins = ctx->GetInputsDim("X"); - // No need to compute gradient for Input(Ids) - for (size_t i = 0; i < ins.size(); i++) { - d_ins.push_back(ins[i]); - } - ctx->SetOutputsDim(framework::GradVarName("X"), d_ins); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), @@ -125,7 +118,8 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( - multiplex, ops::MultiplexCPUKernel); + multiplex, + ops::MultiplexCPUKernel); 
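The mul kernels above flatten both inputs to matrices before calling math::matmul, and the backward pass computes dX = dOut * Y^T and dY = X^T * dOut on the same flattened shapes, matching the shape comments in MulGradKernel. A small sketch of the flattening rule described in MulOpMaker (an assumed standalone helper for illustration, not Paddle's ReshapeToMatrix):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// dims d[0..r-1] flatten to a matrix of shape
// [prod(d[0..num_col_dims-1]), prod(d[num_col_dims..r-1])].
std::pair<std::size_t, std::size_t> FlattenToMatrix(
    const std::vector<std::size_t>& dims, std::size_t num_col_dims) {
  assert(num_col_dims >= 1 && num_col_dims < dims.size());
  std::size_t height = 1, width = 1;
  for (std::size_t i = 0; i < num_col_dims; ++i) height *= dims[i];
  for (std::size_t i = num_col_dims; i < dims.size(); ++i) width *= dims[i];
  // e.g. dims {2, 3, 4, 5, 6} with num_col_dims = 3 gives {24, 30}
  return {height, width};
}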
REGISTER_OP_CPU_KERNEL( multiplex_grad, - ops::MultiplexGradCPUKernel); + ops::MultiplexGradCPUKernel); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 49ed8a8879527fd32dd8b001ea256e46a0353487..4372dc2c65ec7c0f28e46cd070ea471701ce8304 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/operators/multiplex_op.h" @@ -33,10 +33,10 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -60,7 +60,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } @@ -68,11 +69,11 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { @@ -87,8 +88,9 @@ class MultiplexGradGPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( multiplex_grad, - ops::MultiplexGradGPUKernel); + ops::MultiplexGradGPUKernel); diff --git a/paddle/operators/multiplex_op.h 
b/paddle/operators/multiplex_op.h index ab3cafaa324a29d6f249cf1f73db92e1364eebc8..ef66be5556ee613a037de13286ecc66b53885c1f 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -1,17 +1,16 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -22,7 +21,7 @@ namespace paddle { namespace operators { -template +template class MultiplexCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,7 +34,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -47,7 +46,7 @@ class MultiplexCPUKernel : public framework::OpKernel { } }; -template +template class MultiplexGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -60,14 +59,15 @@ class MultiplexGradCPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto* index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc index 6be735e4c731f79684e0bdac3d69a30b328fed84..1602a3d9b54dd64813770a7162f8d4f3dd0e791a 100644 --- a/paddle/operators/nccl/nccl_gpu_common.cc +++ b/paddle/operators/nccl/nccl_gpu_common.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/platform/gpu_info.h" diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 48e322f99398a7f1d6af9cab653d0cc92d981fe0..5173996f2020ec7a94643277e8c7a532d41d9045 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 66fcc09bc877867e66a37adc73230d8dabf4cbed..9d51153b0631b988c9297f395672be67e18ee3f9 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/operators/nccl/nccl_gpu_common.h" @@ -24,7 +27,7 @@ class NCCLInitOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { const auto &name = Output("Communicator"); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), "Can not find variable '%s' in the scope.", name); @@ -43,16 +46,15 @@ class NCCLInitOp : public framework::OperatorBase { class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLInitOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Communicator", "Create Communicator for communicating between gpus"); AddAttr>("gpus", "(vector) GPU id lists"); - AddAttr("data_type", + AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(R"DOC( NCCLInit Operator. @@ -141,8 +143,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel { // AllreduceOp class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLAllReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of AllReduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); @@ -163,8 +164,7 @@ AllReduce the input tensors. // ReduceOp class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of Reduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); @@ -190,8 +190,7 @@ Reduce the tensors. 
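Underneath these makers, ncclInit amounts to creating one communicator per listed GPU and the collective kernels to one NCCL call per input tensor. A bare-NCCL sketch of that pairing (assumes the NCCL C API of this era; error checking omitted):

#include <cuda_runtime.h>
#include <nccl.h>
#include <cstddef>
#include <vector>

// comms[i] plays the role of the "Communicator" variable, one per GPU id.
void AllReduceAcrossGpus(std::vector<int> gpus, float* const* sendbufs,
                         float* const* recvbufs, std::size_t count,
                         const std::vector<cudaStream_t>& streams) {
  std::vector<ncclComm_t> comms(gpus.size());
  ncclCommInitAll(comms.data(), static_cast<int>(gpus.size()),
                  gpus.data());  // what NCCLInitOp does once at startup
  for (std::size_t i = 0; i < gpus.size(); ++i) {
    cudaSetDevice(gpus[i]);
    ncclAllReduce(sendbufs[i], recvbufs[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);  // what NCCLAllReduceKernel issues
  }
  for (auto c : comms) ncclCommDestroy(c);
}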
// BcastOp class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLBcastOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of BcastSend op"); AddInput("Communicator", "Communicator for communicating between gpus"); diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 4f0a2a79edb9f24c7758fc91483d374425b36853..1b986a13650de7d77f4828d71798ee00d61c1284 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto stream = ctx.cuda_device_context().stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); for (size_t i = 0; i < ins.size(); ++i) { @@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); auto ins_names = ctx.Inputs("X"); @@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); if (idx == root) { @@ -204,6 +204,6 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 56ba57854955c08031214d1f751c17fbb8bb882c..6546096069d4c3fbc4908a16c2dba2ac6d7e6421 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include #include @@ -22,6 +22,7 @@ #include #include "paddle/framework/block_desc.h" +#include "paddle/framework/init.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/program_desc.h" @@ -33,9 +34,9 @@ #include "paddle/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_GPU_ONLY_OP(ncclAllReduce); -USE_GPU_ONLY_OP(ncclReduce); -USE_GPU_ONLY_OP(ncclBcast); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; @@ -49,9 +50,9 @@ const f::DDim kDims = {100, 100}; class NCCLTester : public ::testing::Test { public: virtual void SetUp() override { - cpu_ctx = new p::CPUDeviceContext(p::CPUPlace()); + paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list.size(); ++i) { - p::GPUPlace place(i); + p::CUDAPlace place(i); dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); } @@ -65,7 +66,8 @@ class NCCLTester : public ::testing::Test { } void NCCLInitOp() { - std::unique_ptr op1(new f::OpDescBind); + paddle::platform::CPUPlace cpu_place; + std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); op1->SetOutput("Communicator", {"comm"}); @@ -76,17 +78,16 @@ class NCCLTester : public ::testing::Test { auto op = f::OpRegistry::CreateOp(*op1); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *cpu_ctx); + op->Run(g_scope, cpu_place); VLOG(1) << "NCCLInitOp finished."; } template - void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc, - f::Scope *scope) { + void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { std::unique_lock lk(mu); - const f::OpDescBind *op1 = &op_desc; + const f::OpDesc *op1 = &op_desc; - p::GPUPlace place(gpu_id); + p::CUDAPlace place(gpu_id); auto &ctx = dev_ctxs.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); @@ -97,7 +98,7 @@ class NCCLTester : public ::testing::Test { send_tensor->mutable_data(kDims, place); std::vector send_vector(f::product(kDims), gpu_id); - send_tensor->CopyFromVector(send_vector, *ctx); + paddle::framework::CopyFromVector(send_vector, *ctx, send_tensor); ctx->Wait(); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } @@ -112,40 +113,39 @@ class NCCLTester : public ::testing::Test { VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); VLOG(1) << " send_tensor : " << send_tensor->numel() << " recv_tensor : " << recv_tensor->numel(); - op->Run(*scope, *ctx); + op->Run(*scope, place); VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); } public: std::vector dev_ctxs; - p::DeviceContext *cpu_ctx; f::Scope g_scope; std::mutex mu; }; // ncclInitOp with desc TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDescBind); + std::unique_ptr op_desc(new f::OpDesc); op_desc->SetType("ncclInit"); op_desc->SetOutput("Communicator", {"x1"}); op_desc->SetAttr("gpus", {gpu_list}); f::Scope g_scope; - std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); + paddle::platform::CPUPlace cpu_place; auto *var = g_scope.Var("x1"); var->GetMutable(); auto op = f::OpRegistry::CreateOp(*op_desc); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx.get()); + op->Run(g_scope, cpu_place); VLOG(1) << "NCCLInitOp finished."; } // ncclAllReduceOp with desc TEST_F(NCCLTester, ncclAllReduceOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); op2->SetType("ncclAllReduce"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); @@ -171,7 
+171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { for (size_t i = 0; i < dev_scopes.size(); ++i) { p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[i]); + p::CUDAPlace gpu_place(gpu_list[i]); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -180,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[i])->stream()); @@ -192,7 +192,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { // ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); const int kRoot = 0; op2->SetType("ncclReduce"); op2->SetInput("X", {"st"}); @@ -219,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) { float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[kRoot]); + p::CUDAPlace gpu_place(gpu_list[kRoot]); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -229,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[kRoot])->stream()); @@ -240,7 +240,7 @@ TEST_F(NCCLTester, ncclReduceOp) { // ncclBcastOp with desc TEST_F(NCCLTester, ncclBcastOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); const int kRoot = 5; op2->SetType("ncclBcast"); op2->SetInput("X", {"st"}); @@ -268,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) { float result = kRoot; p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[idx]); + p::CUDAPlace gpu_place(gpu_list[idx]); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -277,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[idx])->stream()); @@ -295,9 +295,18 @@ int main(int argc, char **argv) { return 0; } - for (int i = 0; i < dev_count; ++i) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); gpu_list.emplace_back(i); } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + testing::InitGoogleTest(&argc, argv); // device context should be release before scope. diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..84ba3ead2b52547b989a4541f31ea31ffcce6c63 --- /dev/null +++ b/paddle/operators/nce_op.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/nce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class NCEOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Label")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasOutput("Cost")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLabels")); + + auto x_dims = ctx->GetInputDim("Input"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], + ctx->GetInputDim("Bias")[0]); + } + auto num_neg_samples = ctx->Attrs().Get("num_neg_samples"); + auto num_total_classes = ctx->Attrs().Get("num_total_classes"); + std::vector custom_neg_classes = + ctx->Attrs().Get>("custom_neg_classes"); + PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]); + if (custom_neg_classes.size() > 0) { + PADDLE_ENFORCE_EQ(custom_neg_classes.size(), + static_cast(num_neg_samples)); + } + // set dims of output(Out) + std::vector out_dims; + out_dims.push_back(x_dims[0]); + out_dims.push_back(1); + ctx->SetOutputDim("Cost", framework::make_ddim(out_dims)); + + // set dims of output(SampleOut) + std::vector sample_out_dims; + sample_out_dims.push_back(x_dims[0]); + sample_out_dims.push_back(num_neg_samples + num_true_classes); + ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +class NCEOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); + AddInput( + "Label", + "(Tensor) A tensor of shape [batch_size, num_true_class]. " + "'num_true_class' is the number of target classes in each sample." + "The number of target classes per sample should be same. " + "If you have a variable number of target classes, " + "you can pad them out to a constant number by either repeating them" + " or by padding with an otherwise unused class.)"); + AddInput("Weight", + "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the " + "total number of class."); + AddInput( + "Bias", + "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total " + "number of class. It is a dispensable input.") + .AsDispensable(); + AddInput("SampleWeight", + "(Tensor) A tensor of shape [batch_size, 1] storing a weight for " + "each sample. And it is a dispensable input. 
The default value of each sample is 1.")
+        .AsDispensable();
+    AddOutput("Cost",
+              "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
+    AddOutput("SampleLogits",
+              "An intermediate tensor of shape [batch_size, num_neg_samples + "
+              "num_pos_samples]. "
+              "This tensor is the output of the forward kernel and is used in "
+              "the backward kernel to compute grads. "
+              "Given X is the dot product of the input tensor and sampled "
+              "labels' weights, 'SampleLogits' is sigmoid(X).")
+        .AsIntermediate();
+    AddOutput("SampleLabels",
+              "An intermediate tensor of shape [batch_size, num_neg_samples + "
+              "num_pos_samples]. "
+              "This tensor is the output of the forward kernel and is used in "
+              "the backward kernel to compute grads.")
+        .AsIntermediate();
+    AddAttr<int>("num_total_classes",
+                 "Total number of classes in all samples.");
+    AddAttr<int>("num_neg_samples",
+                 "The number of negative classes. The default value is 10.")
+        .SetDefault(10);
+    AddAttr<std::vector<int>>("custom_neg_classes",
+                              "This attribute is only used in unit tests. "
+                              "Classes in this list will be used as negative "
+                              "classes for every sample. Under normal "
+                              "conditions, users should avoid setting this "
+                              "attribute.");
+    AddComment(R"DOC(
+Compute and return the noise-contrastive estimation training loss.
+See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+By default this operator uses a uniform distribution for sampling.
+)DOC");
+  }
+};
+
+class NCEOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasInput("Cost"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
+                   "The input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("Input");
+    auto x_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto w_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(w_grad_name)) {
+      ctx->SetOutputDim(w_grad_name, w_dims);
+    }
+
+    auto bias_grad_name = framework::GradVarName("Bias");
+    if (ctx->HasOutput(bias_grad_name)) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(nce_grad,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6b496f7896dcb412be8ff096fdccb2f0b682369
--- /dev/null
+++ b/paddle/operators/nce_op.h
@@ -0,0 +1,211 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "unsupported/Eigen/CXX11/Tensor" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +void PrepareSamples(const framework::ExecutionContext& context) { + auto label = context.Input("Label"); + const int64_t* label_data = label->data(); + auto label_dims = label->dims(); + int num_total_classes = context.Attr("num_total_classes"); + // for unitest + std::vector custom_neg_classes = + context.Attr>("custom_neg_classes"); + // random machine + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution rand(0, num_total_classes - 1); + + auto sample_labels = context.Output("SampleLabels"); + auto sample_labels_dims = sample_labels->dims(); + int64_t* sample_labels_data = + sample_labels->mutable_data(context.GetPlace()); + + int num_label = label_dims.size() == 2 ? label_dims[1] : 1; + int index = 0; + for (int64_t i = 0; i < label_dims[0]; ++i) { + int j = 0; + for (; j < num_label; ++j) { + sample_labels_data[index++] = label_data[i * num_label + j]; + } + if (custom_neg_classes.size() > 0) { + for (auto label : custom_neg_classes) { + sample_labels_data[index++] = label; + } + } else { + for (; j < sample_labels_dims[1]; ++j) { + // TODO(wanghaoshuang): support more distribution sampling + sample_labels_data[index++] = rand(rng); + } + } + } +} + +template +class NCEKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PrepareSamples(context); + auto sample_labels = context.Output("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_out = context.Output("SampleLogits"); + T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + auto label = context.Input("Label"); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + auto out = context.Output("Cost"); + T* out_data = out->mutable_data(context.GetPlace()); + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int64_t num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. 
/ num_total_classes * num_neg_samples; + // forward bias + auto bias = context.Input("Bias"); + if (bias != nullptr) { + const T* bias_data = bias->data(); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = bias_data[sample_labels_data[i]]; + } + } else { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = 0; + } + } + // forward mul + auto input_mat = EigenMatrix::From(*(context.Input("Input"))); + auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + // forward cost + for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { + int64_t j = 0; + out_data[i] = 0; + T w = sample_weight == nullptr ? 1. : sample_weight_data[i]; + // for true classes + for (; j < num_true_class; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(o / (o + b)); + out_data[i] += w * cost; + } + // for sampled neg classes + for (; j < sample_labels->dims()[1]; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(b / (o + b)); + out_data[i] += w * cost; + } + } + } +}; + +template +class NCEGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto d_out = context.Input(framework::GradVarName("Cost")); + const T* d_out_data = d_out->data(); + auto label = context.Input("Label"); + auto sample_out = context.Input("SampleLogits"); + const T* sample_out_data = sample_out->data(); + auto sample_labels = context.Input("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_total_classes * num_neg_samples; + Tensor sample_grad; // tmp tensor + T* sample_grad_data = + sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); + // backward cost + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + T o = sample_out_data[i]; + T w = sample_weight == nullptr + ? 1 + : sample_weight_data[i / sample_labels->dims()[1]]; + sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class + ? 
w * (b / (o + b)) * (o - 1) + : w * (o * (1 - o) / (o + b)); + sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]]; + } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T* d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("Weight")); + if (d_w != nullptr) { + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) += + x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + // get d_x + auto d_x = context.Output(framework::GradVarName("Input")); + if (d_x != nullptr) { + d_x->mutable_data(context.GetPlace()); + auto d_x_matrix = EigenMatrix::From(*d_x); + auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += + w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc index 78b5e2767842312722fac3509e843a05fe194559..f12074a5f2d08f14b58619fc86b6aa37e4cad132 100644 --- a/paddle/operators/net_op.cc +++ b/paddle/operators/net_op.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
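For the NCE kernels above: with o = sigmoid(logit) and b = num_neg_samples / num_total_classes (the uniform noise probability times the number of noise samples), the cost is -log(o / (o + b)) for a true class and -log(b / (o + b)) for a sampled noise class; differentiating with respect to the logit reproduces the two expressions in NCEGradKernel. A scalar check of that math (hypothetical helper, not part of the patch):

#include <cmath>

// Returns the cost of one sampled logit; *dz receives d(cost)/d(logit).
double NceLogitCost(double z, double b, bool is_true_class, double* dz) {
  double o = 1.0 / (1.0 + std::exp(-z));  // matches the forward sigmoid
  if (is_true_class) {
    *dz = (b / (o + b)) * (o - 1.0);      // as in the backward kernel
    return -std::log(o / (o + b));
  }
  *dz = o * (1.0 - o) / (o + b);          // noise-class branch
  return -std::log(b / (o + b));
}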
@@ -56,11 +69,11 @@ void NetOp::CompleteAddOp(bool calc) { std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); } -std::string NetOp::DebugString() const { +std::string NetOp::DebugStringEx(const framework::Scope* scope) const { std::ostringstream os; - os << OperatorBase::DebugString() << std::endl; + os << OperatorBase::DebugStringEx(scope) << std::endl; for (auto& op : ops_) { - std::istringstream is(op->DebugString()); + std::istringstream is(op->DebugStringEx(scope)); for (std::string line; std::getline(is, line);) { os << " " << line << std::endl; } diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index ebeb262d9621fa35c870b6407992f6b6d2bf7c70..b24042f5ef5822eabcada8ed9d21c552579e8064 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -38,7 +38,10 @@ namespace operators { class NetOp : public framework::OperatorBase { public: static const char kAll[]; - NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} + NetOp() + : framework::OperatorBase("plain_net", framework::VariableNameMap{}, + framework::VariableNameMap{}, + framework::AttributeMap{}) {} NetOp(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, @@ -62,9 +65,9 @@ class NetOp : public framework::OperatorBase { * will be used. */ void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& place) const override { for (auto& op : ops_) { - op->Run(scope, dev_ctx); + op->Run(scope, place); } } @@ -103,7 +106,8 @@ class NetOp : public framework::OperatorBase { void CompleteAddOp(bool calculate = true); - std::string DebugString() const override; + std::string DebugStringEx( + const framework::Scope* scope = nullptr) const override; bool IsNetOp() const override; std::vector OutputVars(bool has_intermediate) const override; diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc index 63bebd5b44719868a38ddf2b023955d1ab05245c..9358f29f62fc21801f8036400d2baebdfd663a3a 100644 --- a/paddle/operators/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
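The NetOp changes above are mechanical consequences of OperatorBase::Run moving from a DeviceContext parameter to a Place; the composite behavior itself is tiny. A toy sketch of that container pattern (stand-in types, not Paddle's actual interfaces):

#include <memory>
#include <vector>

struct Scope;  // stands in for framework::Scope
struct Place;  // stands in for platform::Place

struct OperatorBase {
  virtual ~OperatorBase() = default;
  virtual void Run(const Scope& scope, const Place& place) const = 0;
};

// Running the net forwards the same scope and place to each child op,
// mirroring NetOp::Run after this patch.
struct PlainNet : OperatorBase {
  std::vector<std::unique_ptr<OperatorBase>> ops_;
  void Run(const Scope& scope, const Place& place) const override {
    for (auto& op : ops_) op->Run(scope, place);
  }
};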
#include "paddle/operators/net_op.h" #include @@ -13,8 +26,7 @@ class TestOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; DEFINE_OP_CLONE_METHOD(TestOp); - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, const platform::Place& place) const override { ++run_cnt; } }; @@ -38,10 +50,10 @@ TEST(OpKernel, all) { net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {}))); + {{"Out", {"y"}}}, framework::AttributeMap{}))); net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, - {{"Out", {"z"}}}, {}))); + {{"Out", {"z"}}}, framework::AttributeMap{}))); net->CompleteAddOp(); AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, @@ -58,7 +70,7 @@ TEST(NetOp, insert_op) { NetOp net; auto op1 = std::unique_ptr( new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {})); + {{"Out", {"y"}}}, framework::AttributeMap{})); net.AppendOp(*op1); net.InsertOp(0, *op1); ASSERT_EQ(2UL, net.ops_.size()); @@ -68,10 +80,12 @@ TEST(NetOp, insert_op) { TEST(NetOp, Clone) { NetOp net; - net.AppendOp( - std::unique_ptr(new framework::NOP{"empty", {}, {}, {}})); - net.AppendOp(std::unique_ptr( - new framework::NOP{"empty2", {}, {}, {}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); net.CompleteAddOp(true); auto new_net_op = net.Clone(); ASSERT_NE(new_net_op, nullptr); diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0eeafcaae0a01ed7090800ecb06708773ede67aa --- /dev/null +++ b/paddle/operators/norm_op.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/norm_op.h" +namespace paddle { +namespace operators { + +template +class NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput("Scale", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is C * 1."); + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddOutput("Out", + "(Tensor) The output tensor of norm operator." + "N * M." 
+ "M = C * H * W"); + AddComment(R"DOC( + "Input shape: $(N, C, H, W)$ + Scale shape: $(C, 1)$ + Output shape: $(N, C, H, W)$ + Where + forward + $$ + [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}] + $$ + backward + $$ + \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}} + $$ + )DOC"); + } +}; + +class NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of NormOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", in_x_dims); + } +}; + +class NormOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, + ops::NormOpGrad); +REGISTER_OP_CPU_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CPU_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/operators/norm_op.cu b/paddle/operators/norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2941c89b93141388d3746cf128848960ed6f3625 --- /dev/null +++ b/paddle/operators/norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#define EIGEN_USE_GPU + +#include "paddle/operators/norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CUDA_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5759d6f1f07d50d4c65c93578ba7d7916a2d9b4e --- /dev/null +++ b/paddle/operators/norm_op.h @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + auto* out = context.Output("Out"); + auto epsilon = static_cast(context.Attr("epsilon")); + out->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + auto x = + framework::EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = + framework::EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + auto scale_eigen = + framework::EigenVector::Flatten( + *scale); + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = + framework::EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = + framework::EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor out_batch = out->Slice(n, n + 1); + auto out_batch_eigen = + framework::EigenMatrix::From( + out_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp = framework::EigenVector::Flatten(tmp_tensor); + // get colsum and sqrt , inverse + auto dim = Eigen::array({{0}}); + tmp.device(*place) = x_square_batch_eigen.sum(dim); + tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + out_batch_eigen.device(*place) = + in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col)); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = fea_len; + broadcast_dim_row[0] = 1; + out_batch_eigen.device(*place) = + out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +template +class NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + auto epsilon = static_cast(context.Attr("epsilon")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + in_x_grad->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = 
diff --git a/paddle/operators/batch_norm_op.md b/paddle/operators/op_documentation/batch_norm_op.md
similarity index 98%
rename from paddle/operators/batch_norm_op.md
rename to paddle/operators/op_documentation/batch_norm_op.md
index 80948adf2b9047a9685dbdd90b2296b5a955f9c1..d1392619c42d9206bf4bddcd33ad11b033e6cbdb 100644
--- a/paddle/operators/batch_norm_op.md
+++ b/paddle/operators/op_documentation/batch_norm_op.md
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes
 
 The following graph shows the training computational process of `batch_norm_op`:
 
-[figure link: the training computational process of batch_norm_op (old path)]
+[figure link: the training computational process of batch_norm_op (path updated after the rename)]
 
 cudnn provides APIs to finish the whole series of computation; we can use them in our GPU kernel.
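As a hedged illustration of that sentence (editorial, and not PaddlePaddle's actual kernel): once the tensor descriptors are set up, batch-norm inference reduces to a single cuDNN call. The sketch below assumes cuDNN is installed, zero-fills placeholder buffers, and elides error checks and cleanup.

```cpp
#include <cuda_runtime.h>
#include <cudnn.h>
#include <cstdio>

int main() {
  const int N = 2, C = 3, H = 4, W = 4;
  cudnnHandle_t handle;
  cudnnCreate(&handle);

  cudnnTensorDescriptor_t x_desc, bn_desc;
  cudnnCreateTensorDescriptor(&x_desc);
  cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C,
                             H, W);
  cudnnCreateTensorDescriptor(&bn_desc);
  // Derives the 1xCx1x1 descriptor used for scale/bias/mean/variance.
  cudnnDeriveBNTensorDescriptor(bn_desc, x_desc, CUDNN_BATCHNORM_SPATIAL);

  float *x, *y, *scale, *bias, *mean, *var;
  cudaMalloc(&x, N * C * H * W * sizeof(float));
  cudaMalloc(&y, N * C * H * W * sizeof(float));
  cudaMalloc(&scale, C * sizeof(float));
  cudaMalloc(&bias, C * sizeof(float));
  cudaMalloc(&mean, C * sizeof(float));
  cudaMalloc(&var, C * sizeof(float));
  cudaMemset(x, 0, N * C * H * W * sizeof(float));  // placeholder contents
  cudaMemset(scale, 0, C * sizeof(float));
  cudaMemset(bias, 0, C * sizeof(float));
  cudaMemset(mean, 0, C * sizeof(float));
  cudaMemset(var, 0, C * sizeof(float));

  const float alpha = 1.f, beta = 0.f;
  // One call normalizes with the running statistics, then scales and shifts.
  cudnnStatus_t s = cudnnBatchNormalizationForwardInference(
      handle, CUDNN_BATCHNORM_SPATIAL, &alpha, &beta, x_desc, x, x_desc, y,
      bn_desc, scale, bias, mean, var, /*epsilon=*/1e-5);
  std::printf("status: %s\n", cudnnGetErrorString(s));
  return 0;  // cleanup elided for brevity in this sketch
}
```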
@@ -124,7 +124,7 @@ for pass_id in range(PASS_NUM):
 
 `is_infer` is an attribute. Once an operator is created, its attributes cannot be changed, which suggests that we maintain two `batch_norm_op`s in the model: one whose `is_infer` is `True` (we call it `infer_batch_norm_op`) and one whose `is_infer` is `False` (we call it `train_batch_norm_op`). They share all parameters and variables but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it forks into two branches: one goes through `train_batch_norm_op` and the other goes through `infer_batch_norm_op`:
-[figure link: the network forking into train_batch_norm_op and infer_batch_norm_op branches (old path)]
+[figure link: the network forking into train_batch_norm_op and infer_batch_norm_op branches (path updated after the rename)]
Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will be duplicated.
diff --git a/paddle/operators/name_convention.md b/paddle/operators/op_documentation/name_convention.md
similarity index 96%
rename from paddle/operators/name_convention.md
rename to paddle/operators/op_documentation/name_convention.md
index b5cb176e003b4584321142ac9f1c3380b7010936..a02b356f058da68442516c2705d0bac140f8ef18 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/op_documentation/name_convention.md
@@ -35,8 +35,8 @@ Here we give some examples to show how these rules will be used.
 ```c++
 class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AccumulateOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. If the output size is not the same as input size,
diff --git a/paddle/operators/net_op_design.md b/paddle/operators/op_documentation/net_op_design.md
similarity index 100%
rename from paddle/operators/net_op_design.md
rename to paddle/operators/op_documentation/net_op_design.md
diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/operators/op_documentation/op_markdown_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ee804d592252c727622cbe59b0644813db3c4fd
--- /dev/null
+++ b/paddle/operators/op_documentation/op_markdown_format.md
@@ -0,0 +1,64 @@
+# Standard Markdown Format for Operators
+The following is the standard format for the documentation of all operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each of these sections is covered in further detail in the rest of the document.
+
+# PaddlePaddle Operator Name
+This should be all lowercase; in case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+# Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+# Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature where it was first introduced, and other important details. The relevant paper/article, including a hyperlink, should be cited in this section.
+
+# LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Words within a multi-word variable name should be separated by an underscore (`_`).
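As a worked illustration of these two rules (an editorial addition, not part of the original file), the `sgd` update from the example in the next section would be typeset as:

```latex
$$param\_out = param - learning\_rate * grad$$
```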
+
+# The signature
+This section describes the signature of the operator: a list of Inputs and Outputs, each of which has a short description of what the variable represents and its type. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator shows how the above sections would ideally look in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param - learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
diff --git a/paddle/operators/rnn_design.md b/paddle/operators/op_documentation/rnn_design.md
similarity index 100%
rename from paddle/operators/rnn_design.md
rename to paddle/operators/op_documentation/rnn_design.md
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index adb75df6ef10c59fc6f3db4d36e1ffb1ae0b4b1e..90c53bd17732aa046beceb3ac0a3b8c0d69994f3 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/pad_op.h"
 
@@ -48,7 +48,7 @@ class PadOp : public framework::OperatorWithKernel {
 
 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PadOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input of pad op. 
" @@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* bind = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); bind->SetType("pad_grad"); - return std::unique_ptr(bind); + return std::unique_ptr(bind); } }; @@ -134,6 +134,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); -REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_CPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu index 555a7dba23c6fa2659cabf4858b42ff70d74bf18..433b5f1112a27b36edbe6d99fcdd4fc8395bc2e8 100644 --- a/paddle/operators/pad_op.cu +++ b/paddle/operators/pad_op.cu @@ -1,21 +1,22 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_GPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h index 9534dbf54529e3b9ae2b6640d51fe291e9521927..fdf91a5776620485c38a8b2c5f8b26039e438d0c 100644 --- a/paddle/operators/pad_op.h +++ b/paddle/operators/pad_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -26,7 +26,7 @@ template using EigenTensor = framework::EigenTensor; -template +template void PadFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -42,33 +42,34 @@ void PadFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); out_tensor.device(place) = x_tensor.pad(paddings, pad_value); } -template +template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { int rank = context.Input("X")->dims().size(); switch (rank) { case 1: - PadFunction(context); + PadFunction(context); break; case 2: - PadFunction(context); + PadFunction(context); break; case 3: - PadFunction(context); + PadFunction(context); break; case 4: - PadFunction(context); + PadFunction(context); break; case 5: - PadFunction(context); + PadFunction(context); break; case 6: - PadFunction(context); + PadFunction(context); break; default: PADDLE_THROW( @@ -77,7 +78,7 @@ class PadKernel : public framework::OpKernel { } }; -template +template void PadGradFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -91,12 +92,13 @@ void PadGradFunction(const framework::ExecutionContext& context) { d_x->mutable_data(context.GetPlace()); auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); } } -template +template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -104,22 +106,22 @@ class PadGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - PadGradFunction(context); + PadGradFunction(context); break; case 2: - PadGradFunction(context); + PadGradFunction(context); break; case 3: - PadGradFunction(context); + PadGradFunction(context); break; case 4: - PadGradFunction(context); + PadGradFunction(context); break; case 5: - PadGradFunction(context); + PadGradFunction(context); break; case 6: - PadGradFunction(context); + PadGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1bec0421e76143bef669a4f6fa373cdf01226b2 --- /dev/null +++ 
b/paddle/operators/parallel_do_op.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/framework/executor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/threadpool.h" + +namespace paddle { +namespace operators { + +static constexpr char kInputs[] = "inputs"; +static constexpr char kParameters[] = "parameters"; +static constexpr char kPlaces[] = "places"; + +static constexpr char kOutputs[] = "outputs"; +static constexpr char kParallelScopes[] = "parallel_scopes"; + +static constexpr char kParallelBlock[] = "sub_block"; + +// using ParallelScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; +using OperatorBase = framework::OperatorBase; + +void SplitTensorAndMoveTensorToScopes( + const framework::Scope &scope, + const std::vector &sub_scopes, + const std::vector &places, + const std::vector &names) { + PADDLE_ENFORCE_EQ(sub_scopes.size(), places.size()); + for (auto &argu : names) { + auto *var = scope.FindVar(argu); + const auto &tensor = var->Get(); + auto lod_tensors = tensor.SplitLoDTensor(places); + + for (auto &lod : lod_tensors) { + VLOG(3) << lod.dims(); + } + + for (size_t i = 0; i < sub_scopes.size(); ++i) { + *sub_scopes[i]->Var(argu)->GetMutable() = lod_tensors[i]; + } + } +} + +void WaitOnPlaces(const std::vector places) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + + for (auto &place : places) { + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + } +} + +class ParallelDoOp : public framework::OperatorBase { + public: + ParallelDoOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) + ->GetMutable>(); + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + sub_scopes.push_back(&scope.NewScope()); + } + + // split input + SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places, + Inputs(kInputs)); + // copy parameter + for (auto ¶m : Inputs(kParameters)) { + PADDLE_ENFORCE(scope.FindVar(param)->IsType(), + "Only support parameter type as LoDTensor"); + auto &src = scope.FindVar(param)->Get(); + for (size_t i = 0; i < places.size(); ++i) { + auto &place = places[i]; + auto *sub_scope = sub_scopes[i]; + auto *dst = sub_scope->Var(param)->GetMutable(); + framework::Copy(src, place, dst); + } + } + WaitOnPlaces(places); + + std::vector> workers; + workers.reserve(places.size()); + for (size_t place_idx = 0; 
place_idx < places.size(); ++place_idx) { + VLOG(3) << "Run " << place_idx; + + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge output + for (auto &o_name : Outputs(kOutputs)) { + std::vector lod_tensors; + lod_tensors.reserve(sub_scopes.size()); + for (auto *sub_scope : sub_scopes) { + lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); + } + + auto *lod_tensor_to_be_merged = + scope.FindVar(o_name)->GetMutable(); + lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); + } + WaitOnPlaces(places); + } +}; + +class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "").AsDuplicable(); + AddInput(kParameters, "").AsDuplicable(); + AddInput(kPlaces, ""); + AddOutput(kOutputs, "").AsDuplicable(); + AddOutput(kParallelScopes, ""); + AddAttr(kParallelBlock, ""); + AddComment(R"DOC( +ParallelDo Operator. +)DOC"); + } +}; + +class ParallelDoGradOp : public OperatorBase { + public: + ParallelDoGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // // get device context from pool + // platform::DeviceContextPool &pool = + // platform::DeviceContextPool::Instance(); + // auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + // feed output@grad + SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places, + Inputs(framework::GradVarName(kOutputs))); + WaitOnPlaces(places); + + // for debugging + for (auto &s : Inputs(framework::GradVarName(kOutputs))) { + VLOG(3) << s; + VLOG(3) << scope.FindVar(s)->Get(); + for (auto *sub_scope : sub_scopes) { + VLOG(3) << sub_scope->FindVar(s)->Get(); + } + } + + // exe run + std::vector> workers; + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + VLOG(3) << "Run " << place_idx; + + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + // execute + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge grad + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + VLOG(3) << "merge grad " << s; + + auto &t = sub_scopes[0]->FindVar(s)->Get(); + VLOG(3) << t; + + std::string s_buf = s + "@BUF"; + auto *t_buf = sub_scopes[0]->Var(s_buf)->GetMutable(); + + for (size_t place_idx = 1; place_idx < places.size(); ++place_idx) { + auto &tt = sub_scopes[place_idx]->FindVar(s)->Get(); + VLOG(3) << place_idx; + VLOG(3) << tt; + framework::Copy(tt, places[0], t_buf); + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", 
{{"X", {s, s_buf}}}, {{"Out", {s}}}, + framework::AttributeMap{}); + sum_op->Run(*sub_scopes[0], places[0]); + WaitOnPlaces(places); + } + + VLOG(3) << t; + framework::Copy(t, place, scope.FindVar(s)->GetMutable()); + } + } +}; + +class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("parallel_do_grad"); + for (auto &input_param : this->InputNames()) { + VLOG(3) << input_param; + grad->SetInput(input_param, this->Input(input_param)); + if (input_param != kPlaces) { + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kParallelScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class ParallelDoGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + std::vector input{kParameters, kInputs}; + std::vector output{kOutputs}; + for (auto &s : input) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); + } + for (auto &s : output) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + } + for (auto &s : input) { + ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + } + if (ctx->HasInputs(kParameters)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, + paddle::operators::ParallelDoOpProtoMaker, + paddle::operators::ParallelDoGradOpDescMaker); +REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc deleted file mode 100644 index be9fcc5661f420aadf908cf80cce6c963008b0e4..0000000000000000000000000000000000000000 --- a/paddle/operators/pool_cudnn_op.cc +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/operators/pool_cudnn_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad, - ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL(pool2d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) - -REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad, - ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL(pool3d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 66dd194ccd5ed629c5861552a7c124dc911362d7..446fb0819d98e0eb3d81bb202a67c55a23fc06b6 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/pool_cudnn_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/pool_op.h" #include "paddle/platform/cudnn_helper.h" namespace paddle { @@ -25,11 +26,11 @@ using DataLayout = platform::DataLayout; using PoolingMode = platform::PoolingMode; template -class PoolCudnnOpKernel : public framework::OpKernel { +class PoolCUDNNOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); Tensor *output = ctx.Output("Out"); @@ -86,11 +87,11 @@ class PoolCudnnOpKernel : public framework::OpKernel { }; template -class PoolCudnnGradOpKernel : public framework::OpKernel { +class PoolCUDNNGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); const Tensor *output = ctx.Input("Out"); @@ -162,12 +163,16 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); - -REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); +REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); + +REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index d8c58618cf703d086d3cabc927ebc5eb038b1aec..3e567efd082ed913ce3c19f87c93a2868ebe8864 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -58,6 +58,24 @@ void PoolOp::InferShape(framework::InferShapeContext 
*ctx) const { OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", "Out"); +} + +framework::OpKernelType PoolOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); } void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { @@ -67,8 +85,24 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } -Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) +framework::OpKernelType PoolOpGrad::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -101,15 +135,27 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, AddAttr>("strides", "(vector, default {1, 1}), strides(height, " "width) of pooling operator.") - .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + .SetDefault({1, 1}); + // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0}), paddings(height, width) of pooling " + "(vector, default {0,0}), paddings(height, width) of pooling " "operator." "If global_pooling = true, paddings and ksize will be ignored.") - .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function AddComment(R"DOC( Pool2d Operator. @@ -122,22 +168,21 @@ Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. The input(X) size and output(Out) size may be different. 
-Example: +Example: Input: X shape: $(N, C, H_{in}, W_{in})$ Output: Out shape: $(N, C, H_{out}, W_{out})$ - where + Where $$ - H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ )DOC"); } -Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) +Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of pooling operator. " @@ -177,12 +222,25 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0,0}), paddings(depth, height, " + "(vector, default {0,0,0}), paddings(depth, height, " "width) of pooling operator. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddComment(R"DOC( Pool3d Operator. @@ -199,12 +257,12 @@ Example: X shape: $(N, C, D_{in}, H_{in}, W_{in})$ Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - where - $$ - D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ - W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 - $$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ )DOC"); } @@ -216,19 +274,19 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc index 1010cb762289dd39cd632c699f7528f4ba638278..39a9dfbf794b3dbaf81e2435f8609014dc27f3af 100644 --- a/paddle/operators/pool_op.cu.cc +++ b/paddle/operators/pool_op.cu.cc @@ -16,16 +16,18 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); -REGISTER_OP_GPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index 63492a89e8d4e44a036bc3c2b16cc54c7e77b534..c3d82ecbdeb412f0234fcddc27361d79b58c7122 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -29,6 +29,10 @@ class PoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class PoolOpGrad : public framework::OperatorWithKernel { @@ -36,21 +40,23 @@ class PoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Pool2dOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Pool3dOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; -template +template class PoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -67,41 +73,41 @@ class PoolKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); switch (ksize.size()) { case 2: { if (pooling_type == "max") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool2d_forward; paddle::operators::math::MaxPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; case 3: { if (pooling_type == "max") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - 
paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -109,7 +115,7 @@ class PoolKernel : public framework::OpKernel { } }; -template +template class PoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -130,42 +136,43 @@ class PoolGradKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device(context.GetEigenDevice()) = + temp.device( + *context.template device_context().eigen_device()) = temp.constant(static_cast(0)); switch (ksize.size()) { case 2: { if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor + paddle::operators::math::MaxPool2dGradFunctor pool2d_backward; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; case 3: { if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor + paddle::operators::math::MaxPool3dGradFunctor pool3d_backward; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 4958fa645405db0798f37165030eae95da371477..1d31d813af4ec4eb829b906fa9add38cc71d54f3 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const 
framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { public: - MaxPool2dWithIndexOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -142,7 +141,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut:{0, 0}), paddings(height, width) of pooling " + "(vector, default:{0, 0}), paddings(height, width) of pooling " "operator. " "If global_pooling = true, paddings and will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -166,10 +165,10 @@ Example: Output: Out shape: $(N, C, H_{out}, W_{out})$ Mask shape: $(N, C, H_{out}, W_{out})$ - where + Where $$ - H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ )DOC"); @@ -178,8 +177,7 @@ Example: class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { public: - MaxPool3dWithIndexOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of pooling operator. " @@ -220,7 +218,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0,0}), paddings(depth, " + "(vector, default {0,0,0}), paddings(depth, " "height, width) of pooling operator. " "If global_pooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. 
(Currently, @@ -244,11 +242,11 @@ Example: Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ - where + Where $$ - D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ - W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ )DOC"); @@ -266,12 +264,15 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, @@ -279,9 +280,12 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc index 335064a7eea4ec15c529db5254cbb026ba575f3d..4c9804da639e3ad44f90963b53948cd8b755a6ac 100644 --- a/paddle/operators/pool_with_index_op.cu.cc +++ b/paddle/operators/pool_with_index_op.cu.cc @@ -16,20 +16,28 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index 40766c7e821e8b85aeda9473798a1f696d0ad719..4f4087d1dd36d6e91cdd9a9253dd72a71735e136 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxPoolWithIndexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -35,6 +35,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -44,23 +46,23 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor + paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor + paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; -template +template class MaxPoolWithIndexGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -81,18 +83,20 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.device_context(); + auto& device_ctx = context.template device_context(); math::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor + paddle::operators::math::MaxPool2dWithIndexGradFunctor pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor + paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); diff --git a/paddle/operators/positive_negative_pair_op.cc 
b/paddle/operators/positive_negative_pair_op.cc index 4ba40a62ec5f696ad980c2913f7e162879a557e2..5aa5167dbb83c9caf2e754859938e51700d8ec3f 100644 --- a/paddle/operators/positive_negative_pair_op.cc +++ b/paddle/operators/positive_negative_pair_op.cc @@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Score")->type()), @@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { public: - PositiveNegativePairOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Score", "(Tensor, float) Model Score on an item (with " @@ -155,13 +154,14 @@ class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { "Noting that reducing on the first dim will make the LoD info lost.") .SetDefault(0); AddComment(R"DOC( - PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) - model performance. - Within some context, e.g. the "query", a LTR model generates scores - for a list of items, which gives a partial order of the items. - PositiveNegativePairOp takes a list of reference rank order - (Input("Label")) and the model generated scores (Input(Score)) as - inputs and counts the pairs that ranked correctly and incorrectly. +PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's +performance. + +Within some context, e.g. the "query", a LTR model generates scores for a list +of items, which gives a partial order of the items. PositiveNegativePairOp +takes a list of reference rank order (Input("Label")) and the model generated +scores (Input(Score)) as inputs and counts the pairs that ranked correctly +and incorrectly. 
)DOC"); } }; diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h index 2efd3777e04c17b27c07bccde524de5785af35fe..977e59b7d2f771fc4c3412f0092f1eba92ef22da 100644 --- a/paddle/operators/positive_negative_pair_op.h +++ b/paddle/operators/positive_negative_pair_op.h @@ -22,7 +22,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc index 1ace4f2a5935dcb4239526c42599a42d288ff552..f1598d53cae2a58acd0207a20938b4f744ba0efe 100644 --- a/paddle/operators/precision_recall_op.cc +++ b/paddle/operators/precision_recall_op.cc @@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("MaxProbs")->type()), @@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { public: - PrecisionRecallOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("MaxProbs", "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h index 4a871ce6741469cf9af409ec90215f721d52f36c..c0d55405a362809f414b8dc3b12ed692f96c24e9 100644 --- a/paddle/operators/precision_recall_op.h +++ b/paddle/operators/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 055c471b4561e5fd3c7a65c6f81d66cdce1a5578..ddc21a657024dcc800726475fa6242f8e6576ad1 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/prelu_op.h" #include "paddle/operators/net_op.h" @@ -38,7 +38,7 @@ class PReluOp : public framework::OperatorWithKernel { class PReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of prelu operator."); AddInput("Alpha", "The alpha weight of prelu operator."); @@ -85,7 +85,8 @@ namespace ops = paddle::operators; REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL(prelu, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL(prelu_grad, - ops::PReluGradKernel); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu index 9e391dabae735cc8a740b46b50d31d271f99b65d..1718bb5cd65f48eba391023e4374a30e405a164d 100644 --- a/paddle/operators/prelu_op.cu +++ b/paddle/operators/prelu_op.cu @@ -1,21 +1,22 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/prelu_op.h" -REGISTER_OP_GPU_KERNEL( - prelu, paddle::operators::PReluKernel); -REGISTER_OP_GPU_KERNEL( - prelu_grad, - paddle::operators::PReluGradKernel); +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h index 5ad31c2203ae6c9bf6f48bb9ecf9a714597e7da8..56f9a553ec12d5bfa745af63ec0570ad30910628 100644 --- a/paddle/operators/prelu_op.h +++ b/paddle/operators/prelu_op.h @@ -39,7 +39,7 @@ class PReluFunctor { const T* alpha_; }; -template +template class PReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,9 +54,9 @@ class PReluKernel : public framework::OpKernel { int numel = x->numel(); - Transform trans; - trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr, - PReluFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); } }; @@ -76,7 +76,7 @@ class PReluGradFunctor { const T* alpha_; }; -template +template class PReluGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel { const T* out_ptr = out->data(); int numel = dx->numel(); - Transform trans; - trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr, - PReluGradFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready } diff --git a/paddle/operators/print_op.cc b/paddle/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b233d64c904a8870212af33c5839cfc555b5dc8 --- /dev/null +++ b/paddle/operators/print_op.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include + +#include "paddle/framework/op_registry.h" +#include "paddle/framework/variable.h" + +namespace paddle { +namespace operators { + +#define CLOG std::cout + +const std::string kForward = "FORWARD"; +const std::string kBackward = "BACKWARD"; +const std::string kBoth = "BOTH"; + +struct Formater { + std::string message; + std::string name; + std::vector dims; + std::type_index dtype{typeid(char)}; + framework::LoD lod; + int summarize; + void* data{nullptr}; + + void operator()(size_t size) { + PrintMessage(); + PrintName(); + PrintDims(); + PrintDtype(); + PrintLod(); + PrintData(size); + } + + private: + void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; } + void PrintName() { + if (!name.empty()) { + CLOG << "Tensor[" << name << "]" << std::endl; + } + } + void PrintDims() { + if (!dims.empty()) { + CLOG << "\tshape: ["; + for (auto i : dims) { + CLOG << i << ","; + } + CLOG << "]" << std::endl; + } + } + void PrintDtype() { + if (dtype.hash_code() != typeid(char).hash_code()) { + CLOG << "\tdtype: " << dtype.name() << std::endl; + } + } + void PrintLod() { + if (!lod.empty()) { + CLOG << "\tLoD: ["; + for (auto level : lod) { + CLOG << "[ "; + for (auto i : level) { + CLOG << i << ","; + } + CLOG << " ]"; + } + CLOG << "]" << std::endl; + } + } + + void PrintData(size_t size) { + PADDLE_ENFORCE_NOT_NULL(data); + // print float + if (dtype.hash_code() == typeid(float).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(double).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int64_t).hash_code()) { + Display(size); + } + } + + template + void Display(size_t size) { + auto* d = (T*)data; + CLOG << "\tdata: "; + if (summarize != -1) { + summarize = std::min(size, (size_t)summarize); + for (int i = 0; i < summarize; i++) { + CLOG << d[i] << ","; + } + } else { + for (size_t i = 0; i < size; i++) { + CLOG << d[i] << ","; + } + } + CLOG << std::endl; + } +}; + +// TODO(ChunweiYan) there should be some other printers for TensorArray +class TensorPrintOp : public framework::OperatorBase { + public: + TensorPrintOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + TensorPrintOp(const TensorPrintOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented."); + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + const framework::Variable* in_var_ptr = nullptr; + std::string phase = kForward; + std::string printed_var_name = ""; + + auto& inputs = Inputs(); + if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); + } else if (inputs.find("In@GRAD") != inputs.end() && + !Inputs("In@GRAD").empty()) { + in_var_ptr = scope.FindVar(Input("In@GRAD")); + printed_var_name = Inputs("In@GRAD").front(); + phase = kBackward; + } else { + PADDLE_THROW("Unknown phase, should be forward or backward."); + } + + PADDLE_ENFORCE_NOT_NULL(in_var_ptr); + + auto& in_tensor = in_var_ptr->Get(); + auto* out_var_ptr = scope.FindVar(Output("Out")); + auto& out_tensor = *out_var_ptr->GetMutable(); + + // Just copy data from input tensor to output tensor + // output tensor share same memory with input tensor + out_tensor.ShareDataWith(in_tensor); + 
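// ShareDataWith aliases the input's buffer instead of copying it; the LoD + // metadata is not part of that buffer, so it is propagated explicitly below. +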
out_tensor.set_lod(in_tensor.lod()); + + std::string print_phase = Attr("print_phase"); + if (print_phase != phase && print_phase != kBoth) { + return; + } + + int first_n = Attr("first_n"); + if (first_n > 0 && ++times_ > first_n) return; + + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(in_tensor.lod()); + printed_tensor.Resize(in_tensor.dims()); + + if (platform::is_cpu_place(in_tensor.place())) { + printed_tensor.ShareDataWith(in_tensor); + } else { + // copy data to cpu to print + platform::CPUPlace place; + framework::Copy(in_tensor, place, &printed_tensor); + } + + Formater formater; + if (Attr("print_tensor_name")) { + formater.name = printed_var_name; + } + if (Attr("print_tensor_type")) { + formater.dtype = printed_tensor.type(); + } + if (Attr("print_tensor_shape")) { + auto& dims = printed_tensor.dims(); + formater.dims.resize(dims.size()); + for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; + } + if (Attr("print_tensor_lod")) { + formater.lod = printed_tensor.lod(); + } + formater.summarize = Attr("summarize"); + formater.data = (void*)printed_tensor.data(); + formater(printed_tensor.numel()); + } + + private: + mutable int times_{0}; +}; + +class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { + public: + PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In", "Input tensor to be displayed."); + AddAttr("first_n", "Only log `first_n` number of times."); + AddAttr("message", "A string message to print as a prefix."); + AddAttr("summarize", "Number of elements printed."); + AddAttr("print_tensor_name", "Whether to print the tensor name."); + AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); + AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); + AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); + AddAttr( + "print_phase", + "(string, default 'BOTH') Which phase to display including 'FORWARD' " + "'BACKWARD' and 'BOTH'.") + .SetDefault(kBoth) + .InEnum({kForward, kBackward, kBoth}); + AddOutput("Out", "Output tensor with same data as input tensor."); + AddComment(R"DOC( +Creates a print op that will print when a tensor is accessed. 
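+ +For example, placed after a layer's output, it logs that tensor's dtype, shape, LoD, and the first `summarize` values each time the layer runs (at most `first_n` times).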
+ +Wraps the tensor passed in so that whenever the tensor is accessed, +the message `message` is printed, along with the current value of the +tensor `t`.)DOC"); } }; + +class InferShapeForward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); + context->ShareLoD("In", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In")); + } +}; + +class InferShapeBackward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In@GRAD"), + "Input(In@GRAD) should not be null."); + context->ShareLoD("In@GRAD", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); + } +}; + +class InferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override {} +}; + +class PrintOpProtoAndCheckGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr<framework::OpDesc> Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print_grad"); + op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); + op_desc_ptr->SetOutput("Out", InputGrad("In")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr<framework::OpDesc>(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, + ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, + ops::InferVarType); +REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward); diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc index 36e460103ab46bf6f1408840a0699793e2be134d..b92f46b5bd4e48a25f8c87873c5df53f1753b71b 100644 --- a/paddle/operators/proximal_adagrad_op.cc +++ b/paddle/operators/proximal_adagrad_op.cc @@ -59,8 +59,7 @@ class ProximalAdagradOp : public framework::OperatorWithKernel { class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - ProximalAdagradOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor<float>) " @@ -114,4 +113,4 @@ REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); REGISTER_OP_CPU_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel<paddle::platform::CPUPlace, float>); + ops::ProximalAdagradOpKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu index d0ae0395184ae4f794565f2e28c57f960f0ccbeb..42a178f94b94c8e80ec8f9b5e6471b75878b65d1 100644 --- a/paddle/operators/proximal_adagrad_op.cu +++ b/paddle/operators/proximal_adagrad_op.cu @@ -15,6 +15,6 @@ specific language governing permissions and limitations under the License.
*/ #include "paddle/operators/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h index 7a1560e8cb339a306ab19513808aab165f82cc8a..523924d80e127d9ad2483e6b239fb948aa72200c 100644 --- a/paddle/operators/proximal_adagrad_op.h +++ b/paddle/operators/proximal_adagrad_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,20 +45,20 @@ class ProximalAdagradOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto m_out = EigenVector::Flatten(*moment_out); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); - m_out.device(place) = m + g * g; + m_out.device(*place) = m + g * g; auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); if (l1 > static_cast(0)) { - p_out.device(place) = + p_out.device(*place) = prox_param.sign() * (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) .cwiseMax(static_cast(0.0))) / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); } else { - p_out.device(place) = + p_out.device(*place) = prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); } } diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc index 5693d0ec9ebf4c470dfa5141b6eeee431f24f2ea..2d3bbdaf320a4d6bdf18ec92230a81ad98371498 100644 --- a/paddle/operators/proximal_gd_op.cc +++ b/paddle/operators/proximal_gd_op.cc @@ -47,8 +47,7 @@ class ProximalGDOp : public framework::OperatorWithKernel { class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { public: - ProximalGDOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " @@ -94,4 +93,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); REGISTER_OP_CPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu index 26f4ebaa0f43620fee7ece2d71755be94a0e01a5..b7dd840d19a13cd3329fb68563693a80d22291ca 100644 --- a/paddle/operators/proximal_gd_op.cu +++ b/paddle/operators/proximal_gd_op.cu @@ -15,5 +15,6 @@ specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h index bebda0204173ec5c3ec9a7a9da6fb623171f4cea..64648b3ccaf9615c995d65464607105d87c04198 100644 --- a/paddle/operators/proximal_gd_op.h +++ b/paddle/operators/proximal_gd_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,7 +42,7 @@ class ProximalGDOpKernel : public framework::OpKernel { auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); auto p_out = EigenVector::Flatten(*param_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 061e82412ea5f4f17fd26a7094e68b97138cc09c..f2164a0f80519ed9c2490ab3aa6809dc84c6070d 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/rank_loss_op.h" @@ -35,23 +35,30 @@ class RankLossOp : public framework::OperatorWithKernel { auto right_dims = ctx->GetInputDim("Right"); PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), - "All inputs must have the same size"); - PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), - "All inputs must be row vector with size batch_size x 1."); + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); ctx->SetOutputDim("Out", label_dims); } }; class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - RankLossOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Label", - "The label indicating A ranked higher than B or not, row vector."); - AddInput("Left", "The output of RankNet for doc A, vector."); - AddInput("Right", "The output of RankNet for doc B, vetor."); - AddOutput("Out", "The output loss of RankLoss operator, vector."); + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); AddComment(R"DOC( RankLoss Operator. @@ -65,16 +72,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of the input pair. The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output of RankNet for the two docs and the label, -respectively, and yields the rank loss C_{i,j} using the following equation: +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: -\f$$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ o_{i,j} = o_i - o_j \\ \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} -\f$$ +$$ -The operator can take inputs of one sample or in batch. +The operator can take batch inputs with size batch_size (batch_size >= 1). )DOC"); } @@ -114,7 +122,8 @@ namespace ops = paddle::operators; REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL(rank_loss, - ops::RankLossKernel); REGISTER_OP_CPU_KERNEL( - rank_loss_grad, ops::RankLossGradKernel); + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 779588ff36c792b8925a535d60f1cfbbe3c66d86..294b22738347b17dd67df05291ac496bfb608323 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -1,22 +1,22 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/rank_loss_op.h" -REGISTER_OP_GPU_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); -REGISTER_OP_GPU_KERNEL( - rank_loss_grad, - paddle::operators::RankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index f184d6efcb496a1d7f38540712b6c431f816482e..bd0c49ca6e42bcb1a25c53421c0e672cecbb3a15 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,13 +35,13 @@ class RankLossKernel : public framework::OpKernel { auto left = framework::EigenVector::Flatten(*left_t); auto right = framework::EigenVector::Flatten(*right_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (1. 
+ (left - right).exp()).log() - label * (left - right); } }; -template +template class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -55,7 +55,7 @@ class RankLossGradKernel : public framework::OpKernel { auto* left_t = ctx.Input("Left"); auto* right_t = ctx.Input("Right"); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); auto d_out = framework::EigenVector::Flatten(*d_out_t); auto label = framework::EigenVector::Flatten(*label_t); auto left = framework::EigenVector::Flatten(*left_t); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 0075ccd24271bf83f139e121efad00c2316cc11b..a136c5b447d7a64f783c00c928bf9e248aff6649 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include #include "paddle/framework/executor.h" @@ -25,7 +25,7 @@ constexpr char kOutputs[] = "outputs"; constexpr char kStepScopes[] = "step_scopes"; constexpr char kExStates[] = "ex_states"; constexpr char kStates[] = "states"; -constexpr char kStepBlock[] = "step_block"; +constexpr char kStepBlock[] = "sub_block"; constexpr char kReverse[] = "reverse"; constexpr char kIsTrain[] = "is_train"; #define GRAD_SUFFIX "@GRAD" @@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase { : RecurrentBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto seq_len = static_cast(this->GetSequenceLength(scope)); VLOG(3) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); - framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + framework::Executor executor(place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -270,6 +271,11 @@ class RecurrentOp : public RecurrentBase { executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/); + // get device context from pool + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output this->LinkTensorWithCallback( @@ -278,13 +284,13 @@ class RecurrentOp : public RecurrentBase { framework::LoDTensor *dst_tensor) { if (i == 0) { // create output tensor at begin dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims())); - dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type()); + dst_tensor->mutable_data(place, src_tensor.type()); } auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); // Explicit copy output since the local RNN scope can be destroyed // early. - dst_out.CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx); + framework::Copy(src_tensor, place, dev_ctx, &dst_out); }); scopes.Next(); @@ -310,15 +316,20 @@ class RecurrentGradOp : public RecurrentBase { : RecurrentBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto seq_len = static_cast(GetSequenceLength(scope)); StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); - framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + framework::Executor executor(place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? 
step_id : seq_len - step_id - 1; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; @@ -365,7 +376,7 @@ class RecurrentGradOp : public RecurrentBase { auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); - cur_grad_tensor->CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx); + framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor); } } @@ -401,13 +412,14 @@ class RecurrentGradOp : public RecurrentBase { auto &inside_tensor = cur_scope.FindVar(inside_grad_name) ->Get(); framework::AttributeMap attrs; - attrs["data_type"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); - zero_op->Run(scope, dev_ctx); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); + zero_op->Run(scope, place); } auto new_inside_name = cur_scope.Rename(inside_grad_name); @@ -415,8 +427,8 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); - sum_op->Run(cur_scope, dev_ctx); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, place); cur_scope.Rename(new_inside_name, inside_grad_name); } @@ -434,11 +446,11 @@ class RecurrentGradOp : public RecurrentBase { } if (step_id == 0) { // alloc memory outside->Resize(PrependDims(seq_len, inside.dims())); - outside->mutable_data(dev_ctx.GetPlace(), inside.type()); + outside->mutable_data(place, inside.type()); } auto dst = outside->Slice(seq_offset, seq_offset + 1); - dst.CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx); + framework::Copy(inside, place, dev_ctx, &dst); }); VLOG(5) << "Link outside gradient finished "; @@ -450,8 +462,8 @@ class RecurrentGradOp : public RecurrentBase { [&](const framework::LoDTensor &inside, framework::LoDTensor *outside) { outside->Resize(inside.dims()); - outside->mutable_data(dev_ctx.GetPlace(), inside.type()); - outside->CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx); + outside->mutable_data(place, inside.type()); + framework::Copy(inside, place, dev_ctx, outside); }); VLOG(5) << "Link initialize state gradient finished "; } @@ -480,7 +492,7 @@ class RecurrentGradOp : public RecurrentBase { std::unordered_set LocalVarNames( const framework::Scope &scope) const { - return this->List2Set(scope.GetAllNames(false)); + return this->List2Set(scope.LocalVarNames()); } static std::vector GradVarLists( const std::vector &var_names) { @@ -494,8 +506,7 @@ class RecurrentGradOp : public RecurrentBase { class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - RecurrentOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(kInputs, "rnn inputs").AsDuplicable(); AddInput(kInitialStates, "rnn initial states").AsDuplicable(); @@ -520,8 +531,7 @@ The ex-state means the state value in the ex-timestep or the previous time step string::Sprintf( "The state variable names. 
[%s, %s, %s] must be the same order", kExStates, kStates, kInitStateGrads)); - AddAttr(kStepBlock, - "The step block inside RNN"); + AddAttr(kStepBlock, "The step block inside RNN"); AddAttr(kReverse, R"DOC(Calculate RNN reversely or not. By default reverse=False @@ -563,13 +573,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDescBind(); + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); grad->SetType("recurrent_grad"); for (auto &input_param : this->InputNames()) { grad->SetInput(input_param, this->Input(input_param)); grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param)); + this->InputGrad(input_param, false)); } for (auto &output_param : this->OutputNames()) { @@ -586,7 +596,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kStepBlock, *grad_block_[0]); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; @@ -597,7 +607,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { std::vector output{kOutputs}; for (auto &s : input) { PADDLE_ENFORCE(ctx->HasInputs(s)); - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s))); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); } for (auto &s : output) { PADDLE_ENFORCE(ctx->HasInputs(s)); diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9ed7516826319da422fbb0af4e5c277afa7ae40 --- /dev/null +++ b/paddle/operators/recv_op.cc @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include + +#include "paddle/framework/executor.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/proto_desc.h" +#include "paddle/operators/detail/grpc_server.h" +#include "paddle/operators/detail/sendrecvop_utils.h" +#include "paddle/operators/detail/simple_block_queue.h" + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" + +namespace paddle { +namespace operators { + +void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; +} + +static void CreateTensorFromMessageType(framework::Variable *var, + sendrecv::VarType var_type) { + if (var_type == sendrecv::VarType::LOD_TENSOR) { + var->GetMutable<framework::LoDTensor>(); + } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { + var->GetMutable<framework::SelectedRows>(); + } else { + PADDLE_THROW( + "VariableMessage type %d is not in " + "[LoDTensor, SelectedRows]", + var_type); + } +} + +class RecvOp : public framework::OperatorBase { + public: + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + std::string endpoint = Attr<std::string>("endpoint"); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + server_thread_.reset(new std::thread(RunServer, rpc_service_)); + } + } + + void Stop() override { + detail::MessageWithName term_msg; + term_msg.first = LISTEN_TERMINATE_MESSAGE; + rpc_service_->Push(term_msg); + rpc_service_->ShutDown(); + server_thread_->join(); + } + + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + char ret[256]; + snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(), + grads_counter_[varname]++); + return std::string(ret); + } + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + // FIXME(typhoonzero): no new scopes for every run. + framework::Scope &recv_scope = scope.NewScope(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + + // FIXME(Yancey1989): initialize rpc server with lazy mode. + rpc_service_->SetScope(&recv_scope); + rpc_service_->SetDevCtx(&dev_ctx); + auto param_list = Attr<std::vector<std::string>>("ParamList"); + auto grad_list = Attr<std::vector<std::string>>("GradList"); + auto trainer_count = Attr<int>("Trainers"); + size_t param_count = param_list.size(); + + rpc_service_->Reset(); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + bool exit_flag = false; + VLOG(4) << "param_count:" << param_count + << " trainer_count:" << trainer_count; + while (!exit_flag) { + // TODO(gognwb): simplify this loop. + // Get gradients from multiple trainers; we don't care about the order in + // which they arrive, just add suffix 0~n and then average the gradients. + for (size_t i = 0; i < param_count * trainer_count; ++i) { + // blocking get one var from client.
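+ // Get() blocks on the server's receive queue until a (name, message) + // pair arrives from one of the trainers.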
+ const detail::MessageWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { + VLOG(4) << "received LISTEN_TERMINATE_MESSAGE and RunOp.Run() exit"; + exit_flag = true; + break; + } + auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad has no paired param found!\"" << grad_var_name + << "\""; + } + VLOG(3) << "received grad: " << grad_var_name + << " updating param: " << param_var_name; + + auto *merged_grad = recv_scope.FindVar(grad_var_name); + if (merged_grad == nullptr) { + auto *ptr = recv_scope.Var(grad_var_name); + CreateTensorFromMessageType(ptr, v.second.type()); + VLOG(3) << "Create Variable " << grad_var_name + << " on recv scope, which pointer is " << ptr << " type is " + << v.second.type(); + } + + if (trainer_count > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + + auto *var = recv_scope.Var(grad_var_name); + detail::DeserializeFromMessage(v.second, dev_ctx, var); + } + + if (exit_flag) { + break; + } + + rpc_service_->Reset(); + + std::string program_str = Attr<std::string>("OptimizeProgram"); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDesc program(program_desc); + framework::Executor executor(dev_place); + // Run the sub graph to get the optimized tensors. + try { + executor.Run(program, &recv_scope, 0, /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + + rpc_service_->Done(); + grads_counter_.clear(); + } // while(true) + } + + protected: + std::shared_ptr<detail::AsyncGRPCServer> rpc_service_; + std::shared_ptr<std::thread> server_thread_; + mutable std::unordered_map<std::string, int> grads_counter_; +}; + +class RecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable(); + AddComment(R"DOC( +Recv operator + +This operator will receive tensors from the send_op. +)DOC"); + AddAttr<std::string>("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr<std::string>("OptimizeProgram", "type string", + "Serialized ProgramDesc string for recv to run."); + AddAttr<std::vector<std::string>>( + "ParamList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr<std::vector<std::string>>( + "GradList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr<int>("Trainers", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 2589a54cfc7fc5bc11ae983797d480a134e0eb25..172d28bb3b647901d4de7bc03c9de21e3468a364 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License.
- You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/reduce_op.h" #include "paddle/operators/net_op.h" @@ -37,18 +37,23 @@ class ReduceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( dim, x_rank, "The dim should be in the range [-rank(input), rank(input))."); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - auto dims_vector = vectorize(x_dims); - if (keep_dim || x_rank == 1) { - dims_vector[dim] = 1; + bool reduce_all = ctx->Attrs().Get("reduce_all"); + if (reduce_all) { + ctx->SetOutputDim("Out", {1}); } else { - dims_vector.erase(dims_vector.begin() + dim); - } - auto out_dims = framework::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dim != 0) { - // Only pass LoD when not reducing on the first dim. - ctx->ShareLoD("X", /*->*/ "Out"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + auto dims_vector = vectorize(x_dims); + if (keep_dim || x_rank == 1) { + dims_vector[dim] = 1; + } else { + dims_vector.erase(dims_vector.begin() + dim); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dim != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } } } }; @@ -72,13 +77,14 @@ class ReduceGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } } }; class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor. Tensors with rank at most 6 are " @@ -95,11 +101,16 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) " "If true, retain the reduced dimension with length 1.") .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); comment_ = R"DOC( {ReduceOp} Operator. This operator computes the {reduce} of input tensor along the given dimension. The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. )DOC"; AddComment(comment_); @@ -125,8 +136,7 @@ The result tensor has 1 fewer dimension than the input unless keep_dim is true. 
class ReduceSumOpMaker : public ReduceOpMaker { public: - ReduceSumOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceSum", "sum"); AddComment(comment_); @@ -135,8 +145,7 @@ class ReduceSumOpMaker : public ReduceOpMaker { class ReduceMeanOpMaker : public ReduceOpMaker { public: - ReduceMeanOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceMean", "mean"); AddComment(comment_); @@ -145,8 +154,7 @@ class ReduceMeanOpMaker : public ReduceOpMaker { class ReduceMaxOpMaker : public ReduceOpMaker { public: - ReduceMaxOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceMax", "max"); AddComment(comment_); @@ -155,8 +163,7 @@ class ReduceMaxOpMaker : public ReduceOpMaker { class ReduceMinOpMaker : public ReduceOpMaker { public: - ReduceMinOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceMin", "min"); AddComment(comment_); @@ -180,12 +187,13 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu index d306e1a24096d737438d71d4d4abc35328d160cb..1dd948ed8a79cce8468f2fe210b5636e7dd1f99e 100644 --- a/paddle/operators/reduce_op.cu +++ b/paddle/operators/reduce_op.cu @@ -1,28 +1,29 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/reduce_op.h" namespace ops = paddle::operators; -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_GPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h index dd6547542d16b0fe336184a0c09a8498027db6ea..da5f3977769990a45c94db21f5dbd01ac70ac06e 100644 --- a/paddle/operators/reduce_op.h +++ b/paddle/operators/reduce_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -26,61 +26,63 @@ using DDim = framework::DDim; template using EigenTensor = framework::EigenTensor; - template using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; struct SumFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.sum(dim); } }; struct SumGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim); } }; struct MeanFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.mean(dim); } }; struct MeanGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim) / dx.constant(size); } }; struct MaxFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.maximum(dim); } }; struct MinFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.minimum(dim); } }; struct MaxOrMinGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { auto equals = x == y.broadcast(dim); auto ones = dx.constant(1); @@ -91,30 +93,45 @@ struct MaxOrMinGradFunctor { } }; -template +template class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - int rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - ReduceCompute<1>(context); - break; - case 2: - ReduceCompute<2>(context); - break; - case 3: - ReduceCompute<3>(context); - break; - case 4: - ReduceCompute<4>(context); - break; - case 5: - ReduceCompute<5>(context); - break; - case 6: - ReduceCompute<6>(context); - break; + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + Functor functor; + functor(place, x, out, reduce_dim); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceCompute<1>(context); + break; + case 2: + ReduceCompute<2>(context); + break; + case 3: + ReduceCompute<3>(context); + break; + case 4: + ReduceCompute<4>(context); + break; + case 5: + ReduceCompute<5>(context); + break; + case 6: + ReduceCompute<6>(context); + break; + } } } @@ -139,7 +156,8 @@ class ReduceKernel : public framework::OpKernel { dims = framework::make_ddim(dims_vector); } - auto& place = context.GetEigenDevice(); + auto& place = + *context.template 
device_context().eigen_device(); Functor functor; if (D == 1) { @@ -152,30 +170,50 @@ class ReduceKernel : public framework::OpKernel { } }; -template +template class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - int rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - ReduceGradCompute<1>(context); - break; - case 2: - ReduceGradCompute<2>(context); - break; - case 3: - ReduceGradCompute<3>(context); - break; - case 4: - ReduceGradCompute<4>(context); - break; - case 5: - ReduceGradCompute<5>(context); - break; - case 6: - ReduceGradCompute<6>(context); - break; + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input0); + auto x_reduce = EigenVector::From(*input1); + auto x_reduce_grad = EigenVector::From(*input2); + auto x_grad = EigenVector::Flatten(*output); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input0->numel())}}); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceGradCompute<1>(context); + break; + case 2: + ReduceGradCompute<2>(context); + break; + case 3: + ReduceGradCompute<3>(context); + break; + case 4: + ReduceGradCompute<4>(context); + break; + case 5: + ReduceGradCompute<5>(context); + break; + case 6: + ReduceGradCompute<6>(context); + break; + } } } @@ -201,7 +239,8 @@ class ReduceGradKernel : public framework::OpKernel { Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; broadcast_dim[dim] = input0->dims()[dim]; - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, broadcast_dim[dim]); diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c30447949421da516213b47178828453671c693 --- /dev/null +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -0,0 +1,270 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/lod_rank_table.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { + +class ReorderLoDTensorByRankTableOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input lod tensor to be reordered according to " + "Input(RankTable)."); + AddInput("RankTable", + "(LoDRankTable), the rank table according to which Input(X) is " + "reordered."); + AddOutput("Out", "(LoDTensor), the reordered lod tensor."); + AddComment(R"DOC(ReorderLoDTensorByRankTable operator. + +Input(X) is a batch of sequences. Input(RankTable) stores new orders of the +input sequence batch. The reorder_lod_tensor_by_rank operator reorders the +Input(X) according to the information provided by Input(RankTable). + +For example: + +If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the +Input(X) will be reordered that the fourth sequence in Input(X) will become the +first one, and then followed by the original first, third, and the second one. + +This is: +X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1]. +Out = [Seq3, Seq0, Seq2, Seq1] with a new LoD information. + +If the LoD information of Input(X) is empty, this means Input(X) is not sequence +data. This is also identical to a batch of sequences where each sequence has a +fixed length 1. In this case, the reorder_lod_tensor_by_rank operator reorders +each slice of Input(X) along the first axis according to Input(RankTable). + +This is: +X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The +indices in RankTable are [3, 0, 2, 1]. +Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended. + +NOTE: This operator sorts Input(X) according to a given LoDRankTable which does +not need to be calculated according to Input(X). It can be calculated according +to another different sequence, and then this operator sorts Input(X) according +to the given LoDRankTable. + +)DOC"); + } +}; + +class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { + public: + ReorderLoDTensorByRankTableBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = + detail::Ref(scope.FindVar(Input("X")), + "Cannot find input lod tensor variable %s", Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")), + "Cannot find input rank table variable %s", + Input("RankTable")) + .Get(); + auto &out = + *detail::Ref(scope.FindVar(Output("Out")), + "Cannot find output lod tensor variable %s", Output("Out")) + .GetMutable(); + + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + this->process(place, x, rank_table, &out); + } + + protected: + virtual void process(const platform::Place &place, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const = 0; + + struct AbsoluteRankTableItem { + size_t offset; // the absolute/accumulated offset. 
+ size_t length; // the length + framework::LoD lod; + }; + + std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( + const framework::LoDTensor &x) const { + std::vector absolute_table; + + if (x.lod().empty()) { + // For Tensor without lod, such as the output of sequence_pool_op + size_t size = x.dims()[0]; + absolute_table.reserve(size); + for (size_t i = 0; i < size; ++i) { + absolute_table.emplace_back(); + absolute_table.back().length = 1; + absolute_table.back().offset = i; + } + } else { + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + + auto &offset = lod_offset.second; + + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } + } + + return absolute_table; + } + + size_t CopyTensorAndLod(const platform::Place &place, + const AbsoluteRankTableItem &item, + const framework::LoDTensor &x, + framework::LoDTensor *out, size_t out_offset) const { + auto &out_lod = *out->mutable_lod(); + auto len = item.length; + auto x_offset = item.offset; + + if (out_lod.empty()) { + for (size_t i = 0; i < item.lod.size(); ++i) { + out_lod.push_back(std::vector({0})); + } + } + + for (size_t i = 0; i < out_lod.size(); ++i) { + auto &out_v = out_lod[i]; + auto &new_lod_v = item.lod[i]; + + for (auto &detail : new_lod_v) { + out_v.push_back(out_v.back() + detail); + } + } + + auto x_sliced = x.Slice(x_offset, x_offset + len); + auto out_sliced = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + out_offset += len; + return out_offset; + } +}; + +class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + size_t out_offset = 0; + out->mutable_lod()->clear(); + for (auto &item : rank_table.items()) { + PADDLE_ENFORCE_LT(item.index, absolute_table.size()); + out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out, + out_offset); + } + } +}; + +class IdentityInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ReorderLodTensorByRankGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reorder_lod_tensor_by_rank_grad"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetInput("RankTable", Input("RankTable")); + return std::unique_ptr(grad_op); + } +}; + +class ReorderLoDTensorByRankGradOp : public 
ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + + // offsets = enumerate([item.index for item in rank_table.items()]) + std::vector> offsets; + offsets.reserve(rank_table.items().size()); + for (size_t i = 0; i < rank_table.items().size(); ++i) { + offsets.push_back({i, rank_table.items()[i].index}); + } + + // offsets.sort(key=lambda x: x[1]) + std::sort( + offsets.begin(), offsets.end(), + [](const std::pair &a, + const std::pair &b) { return a.second < b.second; }); + + // Copy TensorAndLod + size_t out_offset = 0; + for (auto &offset : offsets) { + out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first], + x, out, out_offset); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(reorder_lod_tensor_by_rank, + ops::ReorderLoDTensorByRankTableOp, + ops::ReorderLodTensorByRankGradOpMaker, + ops::ReorderLoDTensorByRankTableOpProtoMaker, + ops::IdentityInferShape); +REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad, + ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape); diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index ba774ec2160c0460867de42f7ad9d5cd65ad8d6a..58e8fd6124d8c076337ae9bb2f5103e7a3cb7ff0 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -1,17 +1,16 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
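Conceptually, the gradient op above just applies the inverse permutation; a hedged sketch in plain C++ (names and the int payload are illustrative only, not the framework types):

#include <vector>

// Forward copies slice order[i] of x into slot i of out; the backward pass
// therefore scatters: dx[order[i]] = dout[i]. Sorting (position, index)
// pairs by index, as the op does, realizes exactly this scatter.
std::vector<int> ReorderGrad(const std::vector<int>& dout,
                             const std::vector<int>& order) {
  std::vector<int> dx(dout.size());
  for (size_t i = 0; i < order.size(); ++i) dx[order[i]] = dout[i];
  return dx;  // e.g. dout = {a, b, c, d}, order = {3, 0, 2, 1} -> {b, d, c, a}
}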
*/ #include "paddle/operators/reshape_op.h" @@ -35,21 +34,33 @@ class ReshapeOp : public framework::OperatorWithKernel { auto shape = ctx->Attrs().Get>("shape"); PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); auto x_dims = ctx->GetInputDim("X"); - // TODO(qiao) change batch_size - for (size_t i = 1; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] > 0, - "Each dimension of shape " - "must be positiv except the first."); - } - if (shape[0] < 0) { - shape[0] = x_dims[0]; + + std::vector neg_dims_idx; + // set some dimension to -1 if it is unknown + const int unknown_size = -1; + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, + "Each dimension of Attr(shape) must be positive or %d.", + unknown_size); + if (shape[i] == unknown_size) { + neg_dims_idx.push_back(i); + PADDLE_ENFORCE(neg_dims_idx.size() <= 1, + "Only one dimension of Attr(shape) can be unknown."); + } } - // capacity check + int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); int64_t in_size = framework::product(x_dims); - PADDLE_ENFORCE_EQ(capacity, in_size, - "The size of Input(X) mismatches with Attr(shape)."); + if (neg_dims_idx.size() == 1) { + // dim infer + shape[neg_dims_idx[0]] = in_size / (-capacity); + // recalculate capacity + capacity = shape[neg_dims_idx[0]] * (-capacity); + } + // capacity check + PADDLE_ENFORCE(capacity == in_size, + "The size of Input(X) mismatches with Attr(shape)."); // resize output std::vector shape_int64(shape.size(), 0); std::transform(shape.begin(), shape.end(), shape_int64.begin(), @@ -66,8 +77,7 @@ class ReshapeOp : public framework::OperatorWithKernel { class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReshapeOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of reshape operator."); AddOutput("Out", "The output tensor of reshape operator."); @@ -85,10 +95,13 @@ Given a 2-D tensor X with 2 rows and 2 columns [[1, 2], [3, 4]] and target shape = [1, 4], the reshape operator will transform -the tensor X into a 1-D tensor: +the tensor X into a 2-D tensor: - [1, 2, 3, 4] + [[1, 2, 3, 4]] +One dimension in the target shape can be set -1, representing that its +size is unknown. In this case, the real dimension will be infered from +the original shape of Input(X) and other dimensions in the target shape. )DOC"); } }; diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f487e43b99d5be2af299a9edd91dcda0c4eb7b99 --- /dev/null +++ b/paddle/operators/reshape_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/reshape_op.h" + +REGISTER_OP_CUDA_KERNEL( + reshape, + paddle::operators::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + reshape_grad, + paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/reshape_op.cu.cc b/paddle/operators/reshape_op.cu.cc deleted file mode 100644 index 23dbe089d3b37aabedf9ef166f7bbfbf67da7e0a..0000000000000000000000000000000000000000 --- a/paddle/operators/reshape_op.cu.cc +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/reshape_op.h" - -REGISTER_OP_GPU_KERNEL( - reshape, - paddle::operators::ReshapeKernel); -REGISTER_OP_GPU_KERNEL( - reshape_grad, - paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index beb951713ae2a9fd83fe7c1a5e97ee8c642158a8..d884b03cadbf32de2c44e1aaed52fee9304534eb 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class ReshapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -28,12 +28,12 @@ class ReshapeKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto out_dims = out->dims(); out->mutable_data(ctx.GetPlace()); - out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); + framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out); out->Resize(out_dims); } }; -template +template class ReshapeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); + framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc index a9c45f639c6728ff2fd6de6fcdadfe5032a705d7..f7c250bf913b9213e7d7e2cca9ecadf74cac91a1 100644 --- a/paddle/operators/rmsprop_op.cc +++ b/paddle/operators/rmsprop_op.cc @@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel { class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { public: - RmspropOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " @@ -116,5 +115,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); -REGISTER_OP_CPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu index 52634a54816bcd5ad0ba82a56f1df95110112265..0295dc262f095a2b58ab34e3c2ec9f5440e4bfca 100644 --- a/paddle/operators/rmsprop_op.cu +++ b/paddle/operators/rmsprop_op.cu @@ -1,20 +1,20 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
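The RMSProp kernel applies the standard update rule element-wise over the flattened vectors; as a reference, a scalar sketch (the standard formulation with illustrative names, not copied from the kernel):

#include <cmath>

// One RMSProp step: decay the squared-gradient moving average, form the
// momentum-smoothed step, and apply it to the parameter.
void RmspropStep(float* p, float* ms, float* mom, float g, float lr,
                 float rho, float momentum, float epsilon) {
  *ms = rho * *ms + (1.0f - rho) * g * g;
  *mom = momentum * *mom + lr * g / std::sqrt(*ms + epsilon);
  *p -= *mom;
}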
*/ #define EIGEN_USE_GPU #include "paddle/operators/rmsprop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h index 7bf2129010f994966d79ef11d5cec30159b47068..16a561835d02457cf2268f713289001773e63d6c 100644 --- a/paddle/operators/rmsprop_op.h +++ b/paddle/operators/rmsprop_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class RmspropOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto mom_out = EigenVector::Flatten(*moment_out); auto ms_out = EigenVector::Flatten(*mean_square_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc deleted file mode 100644 index ee61ea300c33722471189d06eb09f67a083d2a4d..0000000000000000000000000000000000000000 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/rnn/recurrent_op_utils.h" - -namespace paddle { -namespace operators { -namespace rnn { - -namespace f = paddle::framework; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len) { - PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); - for (size_t i = 0; i < inlinks.size(); ++i) { - // global inputs - auto input_var = step_scopes[0]->parent().FindVar(inlinks[i]); - PADDLE_ENFORCE_NOT_NULL(input_var, "input link [%s] is not in scope.", - inlinks[i]); - - LoDTensor* input = input_var->GetMutable(); - f::DDim dims = input->dims(); - PADDLE_ENFORCE_EQ(static_cast(dims[0]), seq_len, - "all the inputs be the same length"); - f::DDim step_dims = slice_ddim(dims, 1, dims.size()); - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_input = - step_scopes[j]->Var(inlinks[i])->GetMutable(); - // The input of operators of each step is Tensor here. - // Maybe need to modify Slice function. 
- *step_input = input->Slice(j, j + 1); - step_input->Resize(step_dims); - } - } -} - -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len, const platform::DeviceContext& ctx) { - for (size_t i = 0; i < outlinks.size(); i++) { - auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]); - PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.", - outlinks[i]); - LoDTensor* output = output_var->GetMutable(); - - auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]); - PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]); - f::DDim step_dims = - step_scope_var->template GetMutable()->dims(); - std::vector dims_vec = vectorize(step_dims); - dims_vec.insert(dims_vec.begin(), seq_len); - output->Resize(f::make_ddim(dims_vec)); - output->mutable_data(platform::CPUPlace()); - for (size_t j = 0; j < seq_len; j++) { - LoDTensor* step_output = - step_scopes[j]->FindVar(outlinks[i])->GetMutable(); - // TODO(luotao02) data type and platform::DeviceContext() should set - // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace(), ctx); - } - } -} - -void LinkMemories(const std::vector& scopes, - const std::vector& memories, - const size_t step_id, const int offset) { - PADDLE_ENFORCE_LT(step_id, scopes.size(), - "step [%d] is out of range of step scopes' size [%d]", - step_id, scopes.size()); - PADDLE_ENFORCE_GE(static_cast(step_id) + offset, 0, - "offset [%d] must be large than -[%d]", offset, step_id); - PADDLE_ENFORCE_LT( - step_id + offset, scopes.size(), - "offset [%d] is out of range, it must be less than (%d - %d)", offset, - scopes.size(), step_id); - auto* scope = scopes[step_id]; - auto* linked_scope = scopes[step_id + offset]; - for (auto& attr : memories) { - auto* mem = scope->FindVar(attr.pre_var)->GetMutable(); - auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); - mem->Resize(linked_mem->dims()); - mem->ShareDataWith(*linked_mem); - } -} - -void InitArgument(const ArgumentName& name, Argument* arg, - const framework::OperatorBase& op, bool is_grad) { - arg->step_scopes = - is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes); - arg->inlinks = op.Inputs(name.inlinks); - arg->outlinks = op.Outputs(name.outlinks); - - auto& boot_memories = is_grad ? 
op.Outputs(name.initial_states) - : op.Inputs(name.initial_states); - // attributes - auto& memories = op.Attr>(name.states); - auto& pre_memories = op.Attr>(name.ex_states); - - PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of states, initial_states don't match:%d,%d", - memories.size(), boot_memories.size()); - PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of ex_states, initial_states don't match:%d,%d", - pre_memories.size(), boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set"); - - for (size_t i = 0; i < memories.size(); ++i) { - rnn::StateAttr mem_attr; - mem_attr.var = memories[i]; - mem_attr.pre_var = pre_memories[i]; - mem_attr.boot_var = boot_memories[i]; - (arg->states).push_back(mem_attr); - } -} - -} // namespace rnn -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h deleted file mode 100644 index fb0e158e07745d58c6211d33e385b324e492b95e..0000000000000000000000000000000000000000 --- a/paddle/operators/rnn/recurrent_op_utils.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include - -#include "paddle/framework/operator.h" - -namespace paddle { -namespace operators { -namespace rnn { - -using Scope = framework::Scope; - -/** - * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). - * - * Memory attributes cached by this op, dims will be infered from - * boot memories in father scope. Other attributes are copied from Op's proto - * attributes. - */ -struct StateAttr { - // name of current state variable - std::string var; - // name of previous step's state variable - std::string pre_var; - // name of the variables to init this memory (same role of `boot_layer` in - // PaddlePaddle), which is store in father's scope. - std::string boot_var; -}; - -struct Argument { - std::string step_net; - std::string step_scopes; - std::vector inlinks; - std::vector outlinks; - std::vector states; -}; - -struct ArgumentName { - std::string step_net; - std::string step_scopes; - std::string inlinks; - std::string outlinks; - std::string states; // the memory name - std::string ex_states; // the previous memory name - std::string initial_states; // the boot memory name -}; - -/** - * Prepare inputs for each step net. - */ -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len); - -/** - * Process outputs of step nets and merge to variables. 
- */ -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len, const platform::DeviceContext& ctx); - -void LinkMemories(const std::vector& step_scopes, - const std::vector& memories, const size_t step_id, - const int offset); - -void InitArgument(const ArgumentName& name, Argument* arg, - const framework::OperatorBase& op, bool is_grad = false); - -} // namespace rnn -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc index b621c7f1ba3f9e9613dea5bc98ef74c7c6dae9a0..eb55ed6a05b51d7a6c63d16fcf5aff73f6744903 100644 --- a/paddle/operators/rnn_memory_helper_op.cc +++ b/paddle/operators/rnn_memory_helper_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto mem_var_name = Input("X"); auto *mem_var = scope.FindVar(mem_var_name); PADDLE_ENFORCE(mem_var != nullptr, @@ -57,15 +57,14 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - RNNMemoryHelperOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", ""); AddOutput("Out", ""); - AddAttr("data_type", + AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(""); } }; @@ -78,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto out_grad_var_name = Input(framework::GradVarName("Out")); auto *out_grad_var = scope.FindVar(out_grad_var_name); @@ -95,13 +94,13 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { auto &in_var_tensor = in_var->Get(); framework::AttributeMap attrs; - attrs["data_type"] = framework::ToDataType(in_var_tensor.type()); + attrs["dtype"] = framework::ToDataType(in_var_tensor.type()); attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, dev_place); } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); @@ -114,17 +113,16 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { class RNNMemoryHelperGradOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - RNNMemoryHelperGradOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(framework::GradVarName("Out"), ""); AddInput("X", ""); AddInput("Out", ""); AddOutput(framework::GradVarName("X"), ""); - AddAttr("data_type", + AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(""); } }; diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7351f11c5da7b850681346942ad699aba85a8e0 --- /dev/null +++ b/paddle/operators/roi_pool_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/roi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kROISize = 5; + +class ROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Argmax"), + "Output(Argmax) of ROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of the input tensor is NCHW."); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D tensor of shape (num_rois, 5) " + "given as [[batch_id, x1, y1, x2, y2], ...]."); + PADDLE_ENFORCE(rois_dims[1] == kROISize, + "ROIs should be a 2-D tensor of shape (num_rois, 5) " + "given as [[batch_id, x1, y1, x2, y2], ...]."); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must be greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must be greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must be greater than 0"); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Argmax", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor), " + "the input of ROIPoolOp. " + "The format of the input tensor is NCHW, 
where N is the batch size, " + "C is the number of input channels, " + "H is the height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(Tensor), " + "ROIs (Regions of Interest) to pool over. " + "Should be a 2-D tensor of shape (num_rois, 5), " + "given as [[batch_id, x1, y1, x2, y2], ...], " + "where batch_id is the id of the data, " + "(x1, y1) are the top-left coordinates, and " + "(x2, y2) are the bottom-right coordinates."); + AddOutput("Out", + "(Tensor), " + "The output of ROIPoolOp is a 4-D tensor with shape " + "(num_rois, channels, pooled_h, pooled_w)."); + AddOutput("Argmax", + "(Tensor), " + "Argmaxes corresponding to indices in X used " + "for gradient computation. Only output " + "if arg \"is_test\" is false.") + .AsIntermediate(); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "The pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "The pooled output width.") + .SetDefault(1); + AddComment(R"DOC( +ROIPool operator + +ROI Pooling for Faster-RCNN. The link below provides a further introduction: +https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, + ops::ROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + roi_pool_grad, + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolGradOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a874befe4d12029afa9ce55230da22cb048000aa --- /dev/null +++ b/paddle/operators/roi_pool_op.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
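Before the CUDA kernels, it may help to see the per-bin index arithmetic they share in isolation (a hedged host-side sketch; RoiBin is a hypothetical helper, not part of the patch):

#include <algorithm>
#include <cmath>

// Maps pooled cell (ph, pw) to the clipped half-open input window
// [hstart, hend) x [wstart, wend) inside an ROI of size roi_h x roi_w
// anchored at (roi_start_h, roi_start_w).
void RoiBin(int ph, int pw, int pooled_h, int pooled_w, int roi_start_h,
            int roi_start_w, int roi_h, int roi_w, int height, int width,
            int* hstart, int* hend, int* wstart, int* wend) {
  float bin_h = static_cast<float>(roi_h) / pooled_h;
  float bin_w = static_cast<float>(roi_w) / pooled_w;
  *hstart = std::min(std::max(static_cast<int>(std::floor(ph * bin_h)) + roi_start_h, 0), height);
  *hend = std::min(std::max(static_cast<int>(std::ceil((ph + 1) * bin_h)) + roi_start_h, 0), height);
  *wstart = std::min(std::max(static_cast<int>(std::floor(pw * bin_w)) + roi_start_w, 0), width);
  *wend = std::min(std::max(static_cast<int>(std::ceil((pw + 1) * bin_w)) + roi_start_w, 0), width);
}
// The pooled value is then the max over that window (0 if it is empty).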
*/ + +#include "paddle/operators/roi_pool_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 5; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPUROIPoolForward(const int nthreads, const T* input_data, + const int64_t* input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, T* output_data, + int64_t* argmax_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int roi_start_w = round(offset_input_rois[1] * spatial_scale); + int roi_start_h = round(offset_input_rois[2] * spatial_scale); + int roi_end_w = round(offset_input_rois[3] * spatial_scale); + int roi_end_h = round(offset_input_rois[4] * spatial_scale); + + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 
0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[i] = maxval; + if (argmax_data) { + argmax_data[i] = maxidx; + } + } +} + +template +__global__ void GPUROIPoolBackward( + const int nthreads, const int64_t* input_rois, const T* output_grad, + const int64_t* argmax_data, const int num_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_argmax_data = argmax_data + output_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + platform::CudaAtomicAdd( + offset_input_grad + argmax, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +class GPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + auto in_stride = framework::stride(in_dims); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + size_t rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + GPUROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + channels, height, width, pooled_height, pooled_width, + out->mutable_data(ctx.GetPlace()), + argmax->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + size_t rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + math::SetConstant 
set_zero; + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + + int output_grad_size = out_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), out_grad->data(), + argmax->data(), rois_num, spatial_scale, channels, height, + width, pooled_height, pooled_width, + x_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + roi_pool_grad, + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolGradOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..09a9d3d870c1066f1c6f780c4b3203679e9e7505 --- /dev/null +++ b/paddle/operators/roi_pool_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + const int64_t* rois_data = rois->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + PADDLE_ENFORCE_GE(roi_batch_id, 0); + PADDLE_ENFORCE_LT(roi_batch_id, batch_size); + rois_data += roi_stride[0]; + } + + rois_data = rois->data(); + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + int roi_start_w = round(rois_data[1] * spatial_scale); + int roi_start_h = round(rois_data[2] * spatial_scale); + int roi_end_w = round(rois_data[3] * spatial_scale); + int roi_end_h = round(rois_data[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); + int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); + + const float bin_size_h = + static_cast(roi_height) / static_cast(pooled_height); 
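+ // The ROI is partitioned into a pooled_height x pooled_width grid;
+ // bin_size_h and bin_size_w are the fractional extents of one grid
+ // cell, measured in input pixels.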
+ const float bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + roi_start_h, 0), height); + hend = std::min(std::max(hend + roi_start_h, 0), height); + wstart = std::min(std::max(wstart + roi_start_w, 0), width); + wend = std::min(std::max(wend + roi_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + argmax_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + argmax_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + // Increment ROI data pointer + rois_data += roi_stride[0]; + } + return; + } +}; + +template +class CPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + + if (in_grad) { + const int64_t* rois_data = rois->data(); + const T* out_grad_data = out_grad->data(); + const int64_t* argmax_data = argmax->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); + + auto in_stride = framework::stride(in->dims()); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); + + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = rois_data[0]; + T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (argmax_data[pool_index] >= 0) { + auto index = argmax_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + rois_data += roi_stride[0]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc new file mode 100644 
index 0000000000000000000000000000000000000000..68f4e3531566fd346055404d45651c2b53ebe31b --- /dev/null +++ b/paddle/operators/row_conv_op.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/row_conv_op.h" +#include "paddle/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Filter)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be the same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); + ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LoDTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. 
It " + "is a 2-D tensor with shape (future_context x N), where " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LoDTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +Row convolution is also called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representations for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context - 1} in_{j,:} \cdot W_{j-i, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'x' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out = context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = 
+ EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..41f2c5b9de91ade15b4010f56377675cfd1b611c --- /dev/null +++ b/paddle/operators/row_conv_op.cu @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/row_conv_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? 
(sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
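+          // Warp-level tree reduction: each step pulls val from the lane
+          // "offset" positions above, so after the 16/8/4/2/1 steps lane 0
+          // holds the sum of val across all 32 lanes of the warp.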
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. + val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = 
batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..10d435ab080851713ee08a491c43aad1549f6fbb --- /dev/null +++ b/paddle/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc index a57466a48d4d6016fe2618d19fdca4c4f667124a..40103d864fb58804b39ca5f3c63e802a430ce886 100644 --- a/paddle/operators/save_load_op_test.cc +++ b/paddle/operators/save_load_op_test.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "gtest/gtest.h" #include "paddle/framework/op_registry.h" @@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load); TEST(SaveLoadOp, CPU) { paddle::framework::Scope scope; paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext ctx(place); + auto var = scope.Var("test_var"); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); @@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) { auto save_op = paddle::framework::OpRegistry::CreateOp( "save", {{"X", {"test_var"}}}, {}, attrs); - save_op->Run(scope, ctx); + save_op->Run(scope, place); auto load_var = scope.Var("out_var"); auto target = load_var->GetMutable(); auto load_op = paddle::framework::OpRegistry::CreateOp( "load", {}, {{"Out", {"out_var"}}}, attrs); - load_op->Run(scope, ctx); + load_op->Run(scope, place); int* actual = target->data(); for (int64_t i = 0; i < tensor->numel(); ++i) { EXPECT_EQ(expect[i], actual[i]); diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index 56909fb65f44ad00314103e21bee9535fbd59317..4b1cbe88836e340c94f797806243a6768410ed3d 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include @@ -21,6 +21,7 @@ #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); @@ -89,79 +90,17 @@ class SaveOp : public framework::OperatorBase { auto &tensor = var->Get(); - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - fout.write(reinterpret_cast(&version), sizeof(version)); - } - { // the 2nd field, tensor description - // int32_t size - // void* protobuf message - framework::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); - auto dims = framework::vectorize(tensor.dims()); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - fout.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - fout.write(out.data(), size); - } - { // the 3rd field, tensor data - uint64_t size = tensor.memory_size(); - auto *data_ptr = tensor.data(); - PADDLE_ENFORCE(size < std::numeric_limits::max(), - "Index overflow when writing tensor"); - if (platform::is_gpu_place(tensor.place())) { -#ifdef PADDLE_WITH_CUDA - constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB - std::unique_ptr buf(new char[kBufSize]); - auto &gpu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - uintptr_t data = reinterpret_cast(data_ptr); - while (size != 0) { - size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - boost::get(tensor.place()), - reinterpret_cast(data), size_to_write, - gpu_dev_ctx.stream()); - gpu_dev_ctx.Wait(); - fout.write(buf.get(), size_to_write); - data += size_to_write; - size -= size_to_write; - } -#else - PADDLE_THROW("Unexpected branch"); -#endif - } else { - fout.write(static_cast(data_ptr), - static_cast(size)); - } - } - { // the 4th field, lod information - // uint64_t lod_level - // uint64_t lod_level_1 size in byte. - // int* lod_level_1 data - // ... 
- auto lod = tensor.lod(); - uint64_t size = lod.size(); - fout.write(reinterpret_cast(&size), sizeof(size)); - - for (auto &each : lod) { - size = each.size() * sizeof(framework::LoD::value_type::value_type); - fout.write(reinterpret_cast(&size), sizeof(size)); - fout.write(reinterpret_cast(each.data()), - static_cast(size)); - } - } + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::SerializeToStream(fout, tensor, dev_ctx); } }; class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - SaveOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor ) Input tensor to be saved"); AddComment(R"DOC( diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 5745580504fb9bda551f21665bff5c65ae82aeb9..f634ebe9a2a4648bd08f00af635ef22e8d86a8de 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/scale_op.h" #include "paddle/operators/net_op.h" @@ -38,7 +38,7 @@ class ScaleOp : public framework::OperatorWithKernel { template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor of scale operator."); AddOutput("Out", "(Tensor) Output tensor of scale operator."); @@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", GetAttr("scale")); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -75,6 +75,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); -REGISTER_OP_CPU_KERNEL(scale, - ops::ScaleKernel, - ops::ScaleKernel); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 820fd4e6855bb192ec3292ea6983d5ecae73b6e6..7202c0de707ff0b0b3ad966d9d1d3a7c0a89e880 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -1,19 +1,23 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/scale_op.h" -REGISTER_OP_GPU_KERNEL( - scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h index 4931294c9d3661f4c53798bd0895a5cd38ae4501..395268c2eee40c187f5d211317ca8b28d35a71e0 100644 --- a/paddle/operators/scale_op.h +++ b/paddle/operators/scale_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ScaleKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -31,7 +31,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*tensor); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h index d95436be4f25b9df4aaef57ddb249ecf944f0666..55555300fc3219c0651583d8540b47189c8d3f13 100644 --- a/paddle/operators/scatter.cu.h +++ b/paddle/operators/scatter.cu.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/framework/tensor.h" diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index ce4b794bc35aca0912d89a4ae81a9aa0c73a2104..b65334890633f54e70179bfa8fe5463901f7947e 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), @@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), @@ -78,8 +78,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScatterOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Ref", "The source input of scatter op"); AddInput("Index", @@ -87,10 +86,15 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Updates", "The updated value of updates op"); AddOutput("Out", "The output of add op"); AddComment(R"DOC( -Scatter Operator by selecting from the first axis, +Scatter Operator. -Out = Ref +This operator obtains output by updating the input on selected indices on the first axis: + +$$ +Out = Ref \\ Out[Index] = Ref[Index] + Updates +$$ + )DOC"); } }; diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 3b32ae2fb77a5d3d4c558742ec469c74d15eee07..0c198d225890882ab6697d3a8b3d17e034c06cc4 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "gather.cu.h" #include "paddle/operators/gather_op.h" @@ -59,5 +59,5 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c81a9524d6609a65b3167d95053bf4e85eef0db --- /dev/null +++ b/paddle/operators/send_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" + +#include +#include "paddle/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class SendOp : public framework::OperatorBase { + public: + SendOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto ins = Inputs("X"); + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + for (size_t i = 0; i < ins.size(); i++) { + client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + } + + for (size_t i = 0; i < outs.size(); i++) { + client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + + PADDLE_ENFORCE(client_.Wait()); + } + + private: + mutable detail::RPCClient client_; +}; + +class SendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable(); + AddOutput("Out", "(Tensor) Output tensor to get from server") + .AsDuplicable(); + AddComment(R"DOC( +Recv operator + +This operator will send tensor to recv_op. 
+)DOC"); + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to.") + .SetDefault({}); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea091694798475dfd9631910a750405be950c20c --- /dev/null +++ b/paddle/operators/send_recv_op_test.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/program_desc.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" +#include "paddle/string/printf.h" + +USE_NO_KERNEL_OP(send); +USE_NO_KERNEL_OP(recv); +USE_OP(sum); + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +// global for simplicity. 
+std::unique_ptr<f::OperatorBase> recv_op;
+
+void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+  for (int i = 0; i < 2; ++i) {
+    auto var_name = paddle::string::Sprintf("x%d", i);
+    auto var = scope.Var(var_name);
+    auto tensor = var->GetMutable<f::LoDTensor>();
+    tensor->Resize({10, 10});
+    float *expect = tensor->mutable_data<float>(place);
+    for (int64_t i = 0; i < tensor->numel(); ++i) {
+      expect[i] = static_cast<float>(i);
+    }
+  }
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  out_tensor->mutable_data<float>(place);  // allocate
+}
+
+void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+  int64_t height = 10;
+  int64_t row_numel = 10;
+  m::SetConstant<p::CPUDeviceContext, float> set_one;
+  // init x0
+  std::vector<int64_t> rows0{0, 4, 7};
+  auto x0_var = scope.Var("x0");
+  auto x0 = x0_var->GetMutable<f::SelectedRows>();
+  x0->set_rows(rows0);
+  x0->set_height(height);
+  auto x0_value = x0->mutable_value();
+  x0_value->mutable_data<float>(
+      f::make_ddim({static_cast<int64_t>(rows0.size()), row_numel}), place);
+  set_one(ctx, x0_value, 1.0);
+
+  // init x1
+  std::vector<int64_t> rows1{2, 9};
+  auto x1_var = scope.Var("x1");
+  auto x1 = x1_var->GetMutable<f::SelectedRows>();
+  x1->set_rows(rows1);
+  x1->set_height(height);
+  auto x1_value = x1->mutable_value();
+  x1_value->mutable_data<float>(
+      f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
+  set_one(ctx, x1_value, 1.0);
+
+  auto out_var = scope.Var("Out");
+  auto out = out_var->GetMutable<f::SelectedRows>();
+  auto out_value = out->mutable_value();
+  out->set_height(height);
+  out_value->mutable_data<float>(f::make_ddim({5, 10}), place);
+}
+
+void AddOp(const std::string &type, const f::VariableNameMap &inputs,
+           const f::VariableNameMap &outputs, f::AttributeMap attrs,
+           f::BlockDesc *block) {
+  // insert output
+  for (auto kv : outputs) {
+    for (auto v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(f::proto::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+void StartServerNet(bool is_sparse) {
+  f::Scope scope;
+  p::CPUPlace place;
+  if (is_sparse) {
+    InitSelectedRowsInScope(scope, place);
+  } else {
+    InitTensorsInScope(scope, place);
+  }
+
+  // sub program run in recv_op, for simple test we use sum
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  // X for server side tensors, RX for received tensors, must be of same shape.
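+  // Server-side program: Out = x0 + x1, where x1 holds the value received
+  // from the client.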
+ AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); + + f::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"ParamList", std::vector({"Out"})}); + attrs.insert({"GradList", std::vector({"x1"})}); + std::string program_proto; + PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); + + attrs.insert({"OptimizeProgram", program_proto}); + recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs); + recv_op->Run(scope, place); +} + +TEST(SendRecvOp, CPUDense) { + std::thread server_thread(StartServerNet, false); + sleep(10); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + InitTensorsInScope(scope, place); + + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto in_var = scope.Var("x1"); + auto tensor = in_var->GetMutable(); + float *expected = tensor->data(); + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); + // x1 * 2 == x0 + EXPECT_NE(target->memory_size(), size_t(0)); + float *actual = target->data(); + for (int64_t i = 0; i < target->numel(); ++i) { + EXPECT_EQ(expected[i] * 2, actual[i]); + } + recv_op->Stop(); + server_thread.join(); + recv_op.reset(nullptr); +} + +TEST(SendRecvOp, CPUSparse) { + std::thread server_thread(StartServerNet, true); + sleep(3); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + InitSelectedRowsInScope(scope, place); + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto x0 = scope.Var("x0")->GetMutable(); + auto x1 = scope.Var("x1")->GetMutable(); + auto out = scope.Var("Out")->GetMutable(); + auto actual = out->mutable_value(); + + std::unique_ptr expect{new f::SelectedRows()}; + auto expect_value = expect->mutable_value(); + expect_value->mutable_data(f::make_ddim({5, 10}), place); + + m::SelectedRowsAdd add_functor; + add_functor(ctx, *x0, *x1, expect.get()); + + EXPECT_EQ(actual->numel(), expect_value->numel()); + EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size()); + + for (int64_t i = 0; i < expect_value->numel(); ++i) { + EXPECT_EQ(expect_value->mutable_data(place)[i], + actual->mutable_data(place)[i]); + } + recv_op->Stop(); + server_thread.join(); + recv_op.reset(); +} diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index d1de0b444712a8c304c33bd194e306dfe3c41f02..2f0aad2003e48952ca26ca27573bc45386a4e585 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel { class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceConcatOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LodTensorArray) Input is a vector of LoDTensor, " @@ -68,12 +67,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { "The level should 
be less than the level number of inputs.") .SetDefault(0); AddComment(R"DOC( -The sequence_concat operator concatenates multiple LoDTensors. -It only supports sequence (LoD Tensor with level number is 1) +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) or a nested sequence (LoD tensor with level number is 2) as its input. - Case1: If the axis is other than 0(here, axis is 1 and level is 1), - each input should have the same LoD information and the LoD + each input should have the same LoD information and the LoD information of the output keeps the same as the input. LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) @@ -81,7 +80,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input. LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) - Case2: - If the axis is 0(here, leve is 0), the inputs are concatenated along + If the axis is 0(here, leve is 0), the inputs are concatenated along time steps, the LoD information of the output need to re-compute. The LoD information of level-1 should be same. @@ -125,11 +124,12 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, - sequence_concat_grad, ops::SequenceConcatGradOp); +REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, sequence_concat_grad, + ops::SequenceConcatGradOp, false); REGISTER_OP_CPU_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); + ops::SequenceConcatOpKernel); REGISTER_OP_CPU_KERNEL( sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatGradOpKernel); diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc index 9ca99c2258f547e6f9c23be0d394bc3ea2bb6678..144bdb5af635b0cb75bcd1f654700041186dae46 100644 --- a/paddle/operators/sequence_concat_op.cu.cc +++ b/paddle/operators/sequence_concat_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/sequence_concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_GPU_KERNEL( - sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index 09212070aa90b0f080f6140a312924229162aaec..8445224f46aba6110280783c9080ed4691266b8b 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -59,7 +59,7 @@ LoD ConcatLoD(const std::vector ins, const size_t level) { return out_lod; } -template +template class SequenceConcatOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -119,7 +119,7 @@ class SequenceConcatOpKernel : public framework::OpKernel { } }; -template +template class SequenceConcatGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index 41cadce4c603a9c14db79e2f6b30f8664cf72a38..c5b7c81bd7c6e1110aa9e2ced629bea5d88832d1 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -100,8 +100,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel { class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceConvOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -179,7 +178,10 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, sequence_conv_grad, ops::SequenceConvGradOp); REGISTER_OP_CPU_KERNEL( - sequence_conv, ops::SequenceConvKernel); + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); REGISTER_OP_CPU_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc index 6106b0e46c0ab96e01dfc344055f23dbf4a1a2c3..0b8f2c695564f19cf71ecc56a60e707c3703af36 100644 --- a/paddle/operators/sequence_conv_op.cu.cc +++ b/paddle/operators/sequence_conv_op.cu.cc @@ -1,22 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_conv, ops::SequenceConvKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index b8fbe2647c4338a2fa16aa655ebab64dd8d5417d..bb584b7bfa5fb8f6eb0a452468d24ca034be6f1b 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,21 +56,23 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - math::SetConstant set_zero; - set_zero(context.device_context(), &col, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); - math::ContextProjectFunctor seq_project_functor; + math::ContextProjectFunctor seq_project_functor; - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, false, filter, false, - static_cast(1.0), out, static_cast(0.0)); + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); } }; -template +template class SequenceConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -95,7 +97,8 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); int sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); // use col_shape in the im2col calculation framework::DDim col_shape = {in->dims()[0], sequence_width * context_length}; @@ -104,38 +107,36 @@ class SequenceConvGradKernel : public framework::OpKernel { if (in_g || filter_g || (padding_trainable && padding_data_g)) { col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. 
- set_zero(context.device_context(), &col, static_cast(0)); - math::matmul(context.device_context(), *out_g, false, *filter, - true, T(1.0), &col, T(1.0)); + set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); } - math::ContextProjectFunctor seq_project_functor; - math::ContextProjectGradFunctor seq_project_grad_functor; + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - set_zero(context.device_context(), in_g, static_cast(0)); + set_zero(dev_ctx, in_g, static_cast(0)); - seq_project_grad_functor(context.device_context(), *in_g, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, true, - padding_data_g, &col); + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); } if (padding_trainable && padding_data_g) { padding_data_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), padding_data_g, static_cast(0)); + set_zero(dev_ctx, padding_data_g, static_cast(0)); LoDTensor* input = const_cast(in); - seq_project_grad_functor(context.device_context(), *input, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, false, - padding_data_g, &col); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); } if (filter_g) { filter_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_g, static_cast(0)); + set_zero(dev_ctx, filter_g, static_cast(0)); Tensor filter_grad = *filter_g; LoDTensor out_grad = *out_g; @@ -145,12 +146,12 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = context.Input("PaddingData"); } - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, true, out_grad, - false, T(1.0), &filter_grad, T(1.0)); + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); } } }; diff --git a/paddle/operators/sequence_erase_op.cc b/paddle/operators/sequence_erase_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d17b2686238b2d2f872331edfdbb095fb8693b87 --- /dev/null +++ b/paddle/operators/sequence_erase_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/operators/sequence_erase_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceEraseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceEraseOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceEraseOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
+                   "Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
+                   "with the 2nd dimension equal to 1.");
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(2-D LoDTensor with the 2nd dim. equal to 1) "
+             "Input LoDTensor of SequenceEraseOp.");
+    AddOutput("Out",
+              "(2-D LoDTensor with the 2nd dim. equal to 1) "
+              "Output LoDTensor of SequenceEraseOp.");
+    AddAttr<std::vector<int>>("tokens",
+                              "(vector<int>) Tokens that need to be erased "
+                              "from the input sequences.");
+    AddComment(R"DOC(
+Sequence Erase Operator.
+
+The sequence erase operator erases tokens specified by Attr(tokens) from the
+input sequences Input(X), outputs the remaining data, and updates the LoD
+information at the same time. For example, given a 2-D LoDTensor
+
+    X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T
+
+with lod = [[0, 3, 6, 10]], there are three sequences in the input:
+
+    X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T.
+
+If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing
+operation, the three sequences become
+
+    X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T.
+
+Hence the LoDTensor Output(Out) should be
+
+    Out = [[6, 1, 9, 6, 1, 0, 1]]^T,
+
+with lod = [[0, 1, 3, 7]].
+
+A typical usage of this operator is to remove special tokens, such as the
+blank, start, and end tokens, before computing the edit distance between two
+strings.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
+                             ops::SequenceEraseOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sequence_erase,
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>);
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5da8eba3e1ac1fb85dfc65c2fd801574599e02d9
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.cu
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include +#include "paddle/operators/sequence_erase_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void LabelErasedIdx(const T* in_dat, const int in_len, + const T* tokens, const int tokens_len, + int* num_erased) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + int erased = 0; + for (int i = 0; i < tokens_len; ++i) { + if (in_dat[index] == tokens[i]) { + erased = 1; + } + } + num_erased[index + 1] = erased; + if (index == 0) { + num_erased[0] = 0; + } + } +} + +template +__global__ void GetOutLod(const T* num_erased, const int* in_lod, + const int lod_len, int* out_lod0) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < lod_len) { + out_lod0[index] = in_lod[index] - num_erased[in_lod[index]]; + } +} + +template +__global__ void SetOutput(const T* in_dat, const int in_len, + const int* num_erased, T* out_dat) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + if (in_dat[index] != in_dat[index + 1]) { + out_dat[index - num_erased[index]] = in_dat[index]; + } + } +} + +template +class SequenceEraseOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto tokens_len = tokens.size(); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + thrust::host_vector host_tokens(tokens_len); + for (size_t i = 0; i < tokens.size(); ++i) { + host_tokens[i] = tokens[i]; + } + thrust::device_vector dev_tokens = host_tokens; + thrust::device_vector num_erased(in_len + 1); + + T* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data()); + int* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data()); + + auto stream = ctx.cuda_device_context().stream(); + LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_dat, in_len, dev_tokens_ptr, tokens_len, num_erased_ptr); + thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(), + num_erased.begin() + 1); + + // Calc LoD + auto lod_len = lod0.size(); + thrust::host_vector host_lod(lod_len); + for (size_t i = 0; i < lod_len; ++i) { + host_lod[i] = lod0[i]; + } + thrust::device_vector dev_in_lod = host_lod; + thrust::device_vector dev_out_lod(lod_len); + int* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data()); + int* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); + GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); + thrust::host_vector host_out_lod = dev_out_lod; + std::vector out_lod0(lod_len, 0); + for (size_t i = 0; i < lod_len; i++) { + out_lod0[i] = host_out_lod[i]; + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + + // Set output + out->Resize({out_lod0.back(), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, + num_erased_ptr, out_dat); + } +}; 
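The CUDA kernel above follows a mark–scan–compact scheme: LabelErasedIdx marks each position to be erased, an inclusive prefix sum turns the marks into shift counts, GetOutLod rebuilds the LoD offsets from those counts, and SetOutput compacts the survivors. Below is a minimal host-side sketch of the same scheme, for illustration only; EraseTokens is a hypothetical helper, not part of this patch.

#include <algorithm>
#include <vector>

// Host-side sketch of the erase algorithm: num_erased[i] counts erased
// elements in in[0, i), so each survivor in[i] moves to i - num_erased[i].
std::vector<int> EraseTokens(const std::vector<int>& in,
                             const std::vector<int>& tokens,
                             std::vector<size_t>* lod0) {
  const size_t n = in.size();
  std::vector<size_t> num_erased(n + 1, 0);  // inclusive prefix sums of marks
  for (size_t i = 0; i < n; ++i) {
    bool erased =
        std::find(tokens.begin(), tokens.end(), in[i]) != tokens.end();
    num_erased[i + 1] = num_erased[i] + (erased ? 1 : 0);
  }
  std::vector<int> out(n - num_erased[n]);
  for (size_t i = 0; i < n; ++i) {
    if (num_erased[i] == num_erased[i + 1]) {  // in[i] survives
      out[i - num_erased[i]] = in[i];
    }
  }
  for (size_t& offset : *lod0) {  // same update GetOutLod performs
    offset -= num_erased[offset];
  }
  return out;
}

Running this on the example from the operator comment (X = [2, 2, 6, 1, 3, 9, 6, 1, 0, 1], tokens = [2, 3, 5], lod = [0, 3, 6, 10]) yields [6, 1, 9, 6, 1, 0, 1] with lod [0, 1, 3, 7], matching the documented output.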
+ +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(sequence_erase, + paddle::operators::SequenceEraseOpCUDAKernel); diff --git a/paddle/operators/sequence_erase_op.h b/paddle/operators/sequence_erase_op.h new file mode 100644 index 0000000000000000000000000000000000000000..cb2d7be009dcbe0138818457249e95fbdd27fc0a --- /dev/null +++ b/paddle/operators/sequence_erase_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SequenceEraseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + std::vector num_erased(in_len + 1, 0); + std::vector out_lod0(1, 0); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + size_t num_out = 0; + for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { + num_erased[j] = num_erased[j - 1]; + if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != + tokens.end()) { + num_erased[j] += 1; + } else { + num_out += 1; + } + } + out_lod0.push_back(out_lod0.back() + num_out); + } + + auto out_len = in_len - num_erased[in_len]; + out->Resize({static_cast(out_len), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + + for (int64_t i = 0; i < in_len; ++i) { + if (num_erased[i] == num_erased[i + 1]) { + out_dat[i - num_erased[i]] = in_dat[i]; + } + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/sequence_expand_op.cc similarity index 72% rename from paddle/operators/seq_expand_op.cc rename to paddle/operators/sequence_expand_op.cc index b862056ad400290a60e8a75a23dceeb1d4422ea4..b40ec617e42110e0ab5168a8ac675adaf760fb3c 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/sequence_expand_op.cc @@ -1,25 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
-#include "paddle/operators/seq_expand_op.h"
+#include "paddle/operators/sequence_expand_op.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
 
-class SeqExpandOp : public framework::OperatorWithKernel {
+class SequenceExpandOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -35,25 +35,24 @@ class SeqExpandOp : public framework::OperatorWithKernel {
   }
 };
 
-class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SeqExpandOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor or LoDTensor) The input(X) of this operator can be a "
             "LoDTensor or a base Tensor.");
     AddInput("Y",
-             "(LoDTensor)The reference input(Y) of seq_expand op."
+             "(LoDTensor) The reference input(Y) of sequence_expand op. "
             "It must be a LoDTensor with k-level (k > 0). "
             "The input(X) will be expanded according to the LoD of input(Y). "
             "The number of elements in the last LoD level of input(Y) "
             "must be equal to dims[0] of input(X).");
     AddOutput("Out",
-              "(LodTensor)The output of seq_expand op."
+              "(LoDTensor) The output of sequence_expand op. "
              "The LoD of the output will be the same as input(Y)'s LoD.");
     AddComment(R"DOC(
-Seq Expand Operator.
+Sequence Expand Operator.
 
 This operator expands input(X) according to the LoD of input(Y).
Following are cases to better explain how this works: @@ -124,7 +123,7 @@ then we get 2-level LoDTensor } }; -class SeqExpandOpGrad : public framework::OperatorWithKernel { +class SequenceExpandOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -146,10 +145,11 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, - seq_expand_grad, ops::SeqExpandOpGrad); -REGISTER_OP_CPU_KERNEL(seq_expand, - ops::SeqExpandKernel); +REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, + sequence_expand_grad, ops::SequenceExpandOpGrad); REGISTER_OP_CPU_KERNEL( - seq_expand_grad, - ops::SeqExpandGradKernel); + sequence_expand, + ops::SequenceExpandKernel); +REGISTER_OP_CPU_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel); diff --git a/paddle/operators/sequence_expand_op.cu b/paddle/operators/sequence_expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0b9638b2ce60f73c95e1ccbcfb16cef7b5351073 --- /dev/null +++ b/paddle/operators/sequence_expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/sequence_expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_expand, + ops::SequenceExpandKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/sequence_expand_op.h similarity index 73% rename from paddle/operators/seq_expand_op.h rename to paddle/operators/sequence_expand_op.h index 4ef0d02cf85c43e95335660be65a67df66b4f55c..2ba628e9c37278025e31779ab0468db46f2ff40a 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/sequence_expand_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -23,8 +23,8 @@ namespace operators { using LoDTensor = framework::LoDTensor; -template -class SeqExpandKernel : public framework::OpKernel { +template +class SequenceExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); @@ -37,7 +37,8 @@ class SeqExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -50,7 +51,7 @@ class SeqExpandKernel : public framework::OpKernel { Eigen::TensorMap> out_t(out_data, scale, element_len); Eigen::array cast({{scale, 1}}); - out_t.device(place) = x_t.broadcast(cast); + out_t.device(*place) = x_t.broadcast(cast); x_data += element_len; out_data += element_len * scale; } @@ -69,8 +70,8 @@ class SeqExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template -class SeqExpandGradKernel : public framework::OpKernel { +template +class SequenceExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* d_out = context.Input(framework::GradVarName("Out")); @@ -89,8 +90,9 @@ class SeqExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = context.GetEigenDevice(); - d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index 2a000ac60b176737277605c3ac812ea65a0e03fc..549d9620ef20e2be7d030f41f1e7567cb1922f3b 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -37,8 +37,7 @@ class SequencePoolOp : public framework::OperatorWithKernel { class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequencePoolOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); AddOutput("Out", @@ -50,7 +49,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddAttr( "pooltype", - "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.") + "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") .SetDefault("AVERAGE") .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); AddComment(R"DOC( @@ -58,12 +57,12 @@ Sequence Pool Operator. The SequencePoolOp pools features of all time-steps of each instance. It supports six pooling types: -1. AVERAGE: Out[i] = $$avg(X_i)$$ -2. SUM: Out[i] = $$\sum_jX_{ij}$$ -3. SQRT: Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$ +2. SUM: $$Out[i] = \sum_jX_{ij}$$ +3. 
SQRT: $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ 4. LAST: Out[i] = last instance in i-th sequence X[i] 5. FIRST: Out[i] = first instance in i-th sequence X[i] -6. MAX: Out[i] = $$max(X_i)$$ +6. MAX: $$Out[i] = max(X_i)$$ The following example explains how this works: For a mini-batch of 3 variable-length sentences, @@ -104,10 +103,11 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -115,14 +115,35 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { } }; +class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_pool_grad"); + op_desc_ptr->SetInput("X", Input("X")); + if (boost::get(GetAttr("pooltype")) == "MAX") { + op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex")); + } + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, - sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, + ops::SequencePoolGradOpMaker); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); REGISTER_OP_CPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); + sequence_pool, + ops::SequencePoolKernel); REGISTER_OP_CPU_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu index 66850772d501f873cf754205c19e9d0c0090370a..265f695935236236f98c2dd2062072756e9c8b14 100644 --- a/paddle/operators/sequence_pool_op.cu +++ b/paddle/operators/sequence_pool_op.cu @@ -1,24 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index 7f136d8cf0e1eaae7b4de32988b60ae8a5034cc6..7519aa1d7208b9832f7a3d3afbc59a2eb4e8e13a 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -30,7 +30,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SequencePoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,17 +54,18 @@ class SequencePoolKernel : public framework::OpKernel { auto lod_level_0 = lod[0]; out->mutable_data(context.GetPlace()); - + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; + math::MaxSeqPoolFunctor max_pool; auto* index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); - max_pool(context.device_context(), *in, out, index); + max_pool(dev_ctx, *in, out, index); return; } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { Tensor in_t = in->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); @@ -91,7 +92,7 @@ class SequencePoolKernel : public framework::OpKernel { } }; -template +template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -105,20 +106,23 @@ class SequencePoolGradKernel : public framework::OpKernel { int64_t w = in->numel() / dims[0]; in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; + math::MaxSeqPoolGradFunctor max_pool_grad; auto* index = context.Input("MaxIndex"); - max_pool_grad(context.device_context(), *out_g, *index, in_g); + max_pool_grad(dev_ctx, *out_g, *index, in_g); return; } if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context.device_context(), in_g, 0); + math::SetConstant functor; + functor(dev_ctx, in_g, 0); } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc old mode 100755 new mode 100644 index cbe0b4233160dd1f3ebdf6db8b5f6df392efdfe7..f79106ff0f7a3d0918bc3a5c428179cb170ffc79 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -45,10 +45,10 @@ class SequenceSliceOp : public framework::OperatorWithKernel { // Initialize the output's dims to maximum, // and re-set to real dims by the value of Offset and Length at kernel ctx->SetOutputDim("Out", input_dims); - } + } protected: - framework::OpKernelType GetKernelType( + 
framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceSliceOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor), " @@ -93,8 +92,7 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "a vector to describe the length of every input sequence for " "sub sequence item."); - AddOutput("Out", - "(LoDTensor), the output of SequenceSliceOp."); + AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp."); AddComment(R"DOC( Sequence slice operator @@ -126,7 +124,7 @@ REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, sequence_slice_grad, ops::SequenceSliceGradOp); REGISTER_OP_CPU_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu index a9f59dadba74d900fa5cc0601fb5b264ea19e34d..43a21d619f4116874c329eb968f09dc230975c05 100755 --- a/paddle/operators/sequence_slice_op.cu +++ b/paddle/operators/sequence_slice_op.cu @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h old mode 100755 new mode 100644 index 2c9b8464a1236a054cf1a38b9dc1d73588f8dd38..0e4e4cf65fc69121f31886bb9909a87fea56a0be --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -26,7 +26,7 @@ using LoD = framework::LoD; template inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, - const int64_t* length_data) { + const int64_t* length_data) { auto out_lod = in.lod(); size_t lod_offset = 0; @@ -34,12 +34,12 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, out_lod[0][0] = 0; for (size_t i = 0; i < n; ++i) { lod_offset += length_data[i]; - out_lod[0][i+1] = lod_offset; + out_lod[0][i + 1] = lod_offset; } return out_lod; } -template +template class SequenceSliceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,14 +51,13 @@ class SequenceSliceOpKernel : public framework::OpKernel { auto lod = in->lod(); auto n = lod[0].size() - 1; - PADDLE_ENFORCE_EQ(lod.size(), 1UL, - "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ( n, static_cast(length->dims()[0]), - "The size of input-sequence and length-array should be the same") + "The size of input-sequence and length-array should be the same"); PADDLE_ENFORCE_EQ( n, static_cast(offset->dims()[0]), - "The size of input-sequence and offset-array should be the same") + "The size of input-sequence and offset-array should be the same"); const int64_t* offset_data = offset->data(); const int64_t* length_data = length->data(); @@ -67,23 +66,23 @@ class SequenceSliceOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_LT(0, offset_data[i], - "The offset[%d] must greater than zero.", i) + "The offset[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(0, length_data[i], - "The length[%d] must greater than zero.", i) - PADDLE_ENFORCE_LT( - lod[0][i] + offset_data[i] + length_data[i], - lod[0][i + 1], - "The target tensor's length overflow.") + "The length[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + lod[0][i + 1], "The target tensor's length overflow."); } out->mutable_data(ctx.GetPlace()); @@ -98,20 +97,18 @@ class SequenceSliceOpKernel : public framework::OpKernel { size_t out_offset = 0; for (size_t i = 0; i < n; ++i) { - Tensor in_t = - in->Slice(static_cast(lod[0][i] + offset_data[i]), - static_cast(lod[0][i] + offset_data[i] + - 
length_data[i])); - - StridedMemcpy(ctx.device_context(), in_t.data(), - in_stride, in_t.dims(), out_stride, - out->data() + out_offset); + Tensor in_t = in->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out->data() + out_offset); out_offset += length_data[i] * in_stride[0]; } } }; -template +template class SequenceSliceGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -130,11 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } @@ -144,8 +143,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); auto out_grad_stride = framework::stride(out_grad->dims()); @@ -162,8 +162,8 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { static_cast(lod[0][i] + offset_data[i] + length_data[i])); StridedMemcpy(ctx.device_context(), out_grad_t.data(), - out_grad_stride, out_grad_t.dims(), x_grad_stride, - x_grad_t.data()); + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); } } } diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index 32c15025660ebf0baf317e269a33c047e6844219..b74766f012e333cc2a317e6efe17c5b60238924a 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -33,8 +33,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceSoftmaxOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension " @@ -51,10 +50,14 @@ input Tensor can be either [N, 1] or [N], where N is the sum of the length of all sequences. 
The algorithm works as follows: + for i-th sequence in a mini-batch: - $$Out(X[lod[i]:lod[i+1]], :) = - \frac{\exp(X[lod[i]:lod[i+1], :])} - {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$ + +$$ +Out(X[lod[i]:lod[i+1]], :) = \ +\frac{\exp(X[lod[i]:lod[i+1], :])} \ +{\sum(\exp(X[lod[i]:lod[i+1], :]))} +$$ For example, for a mini-batch of 3 sequences with variable-length, each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], @@ -99,7 +102,7 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel); + ops::SequenceSoftmaxKernel); REGISTER_OP_CPU_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc index 7023795a3b5777c250a9323a304a54849d763e9e..5f65b4daf97cf025b975d2d95212375b5fca01f8 100644 --- a/paddle/operators/sequence_softmax_op.cu.cc +++ b/paddle/operators/sequence_softmax_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_softmax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel) -REGISTER_OP_GPU_KERNEL( + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 1b68dd0662ddfffc57b187945fe131e202c55174..e889e88cb34719b6648e3032754645fbb2807741 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceSoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,12 +52,13 @@ class SequenceSoftmaxKernel : public framework::OpKernel { framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxFunctor()(ctx.device_context(), &x_i, &out_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; -template +template class SequenceSoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -83,8 +84,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradFunctor()(ctx.device_context(), &out_i, - &out_grad_i, &x_grad_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); } } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 72f4e4d5cbcd692423fa2a3e9ec8e7033b552c3c..a11c9624ce5e8485449dd6b420ad1f23ff3550c7 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -43,7 +43,7 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpMaker : public framework::OpProtoAndCheckerMaker { public: - SGDOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("LearningRate", "(Tensor) Learning rate of SGD"); @@ -55,48 +55,15 @@ SGD operator This 
operator implements one step of the stochastic gradient descent algorithm. -$$param_out = param - learning_rate * grad$$ +$$param\_out = param - learning\_rate * grad$$ )DOC"); } }; -template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, - const framework::SelectedRows& input, - const framework::Tensor& learning_rate, - framework::Tensor* output) { - auto in_height = input.height(); - auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in_height, out_dims[0]); - - auto& in_value = input.value(); - auto& in_rows = input.rows(); - - int64_t in_row_numel = in_value.numel() / in_rows.size(); - PADDLE_ENFORCE_EQ(in_row_numel, output->numel() / in_height); - - auto* in_data = in_value.data(); - auto* out_data = output->data(); - auto* lr = learning_rate.data(); - - for (size_t i = 0; i < in_rows.size(); i++) { - for (int64_t j = 0; j < in_row_numel; j++) { - out_data[in_rows[i] * in_row_numel + j] -= - lr[0] * in_data[i * in_row_numel + j]; - } - } - } -}; - -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); -REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 7b6c5ec30628b521b594ceaa3b7f1e0e03e497e4..42f8f8b2f072f9d204dfadcd732926b5c98dc617 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/sgd_op.h" @@ -20,6 +20,19 @@ namespace paddle { namespace operators { namespace { + +template +__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, + const int num, T* p_out) { + T lr = learning_rate[0]; + int grid_size = blockDim.x * gridDim.x; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) { + T g_data = g[i]; + T p_data = p[i]; + p_out[i] = p_data - lr * g_data; + } +} + template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, @@ -41,42 +54,65 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, - const framework::SelectedRows& input, - const framework::Tensor& learning_rate, - framework::Tensor* output) { - auto in_height = input.height(); - auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in_height, out_dims[0]); - - auto& in_value = input.value(); - auto& in_rows = input.rows(); - - int64_t in_row_numel = in_value.numel() / in_rows.size(); - PADDLE_ENFORCE_EQ(in_row_numel, output->numel() / in_height); - - auto* in_data = in_value.data(); - auto* out_data = output->data(); - - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<(context) - .stream()>>>(in_data, in_rows.data(), - learning_rate.data(), out_data, - in_row_numel); +class SGDOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. + if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + auto* grad_data = grad->data(); + auto* param_data = param->data(); + auto* param_out_data = param_out->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + + SGDKernel<<>>( + grad_data, param_data, learning_rate->data(), param->numel(), + param_out_data); + + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. 
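+      // The sparse branch applies a row-wise scatter update: for the i-th
+      // row stored in Grad, param_out[rows[i], :] -= lr * grad_value[i, :].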
+ PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + auto& in_rows = grad->rows(); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in_rows.size()); + SparseSGDFunctorKernel< + T, 256><<>>( + in_data, in_rows.data(), learning_rate->data(), out_data, + in_row_numel); + + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } } }; - -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel, + ops::SGDOpCUDAKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 78b595fc6c63d775b627f23cafa9458f1dadd4e5..a6c544591e1172320f6cf7192bf640ff25225b99 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,15 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, - const framework::SelectedRows& input, - const framework::Tensor& learning_rate, - framework::Tensor* output); -}; - -template +template class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,19 +37,36 @@ class SGDOpKernel : public framework::OpKernel { auto p = framework::EigenVector::Flatten(*param); auto g = framework::EigenVector::Flatten(*grad); auto o = framework::EigenVector::Flatten(*param_out); - auto lr = framework::EigenVector::Flatten(*learning_rate); - auto place = ctx.GetEigenDevice(); + auto* lr = learning_rate->data(); - Eigen::DSizes grad_dsize(grad->numel()); - o.device(place) = p - lr.broadcast(grad_dsize) * g; + o = p - lr[0] * g; } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. 
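      // In-place update is what makes this branch correct: only the rows
      // listed in Grad are touched, so ParamOut must alias Param for every
      // untouched row to keep its previous value.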
PADDLE_ENFORCE_EQ(param, param_out); auto* grad = ctx.Input("Grad"); - SparseSGDFunctor functor; - functor(ctx.device_context(), *grad, *learning_rate, param_out); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + auto& in_rows = grad->rows(); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + auto* lr = learning_rate->data(); + + for (size_t i = 0; i < in_rows.size(); i++) { + for (int64_t j = 0; j < in_row_numel; j++) { + out_data[in_rows[i] * in_row_numel + j] -= + lr[0] * in_data[i * in_row_numel + j]; + } + } } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 65bccc0c81d0ad9674649933a20ec7b09fec5b37..bf870115a4d7b6f4d578df7707826973d4363ba6 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -1,17 +1,18 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/framework/lod_rank_table.h" +#include "paddle/framework/lod_tensor.h" #include "paddle/operators/array_operator.h" #include "paddle/operators/math/math_function.h" @@ -27,11 +28,11 @@ class ShrinkRNNMemoryOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x_var = scope.FindVar(Input("X")); PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); auto &x_tensor = x_var->Get(); - size_t offset = this->GetOffset(scope, dev_ctx); + size_t offset = this->GetOffset(scope, place); auto *rank_table_var = scope.FindVar(Input("RankTable")); PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); auto &rank_table = rank_table_var->Get(); @@ -44,24 +45,48 @@ class ShrinkRNNMemoryOp : public ArrayOp { rank_items.begin(); auto *out_var = scope.FindVar(Output("Out")); - PADDLE_ENFORCE(out_var != nullptr, "Output Out must be set"); + PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set."); auto &out_tensor = *out_var->GetMutable(); + + size_t height = dst_num_rows; + + // do shrink for the top level LoD + if (x_tensor.lod().size() > 0 && + x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); + } + if (dst_num_rows != 0) { - out_tensor.ShareDataWith(x_tensor.Slice(0, dst_num_rows)); + out_tensor.ShareDataWith(x_tensor.Slice(0, height)); } } }; class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", ""); - AddInput("RankTable", ""); - AddInput("I", ""); - AddOutput("Out", ""); - AddComment(""); + AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); + AddInput("I", + "(LoDTensor) The step index. The RNN step memory 'X' will be " + "shrinked to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + AddComment(R"DOC( +This operator is used to shrink output batch of memory defined in dynamic RNN. + +Dynamic RNN is able to handle variable-length sequences, in which, sequences in +a mini-batch are sorted by their lengths first. After that, the longest sequence +becomes the first one in the sorted batch, followed by the second longest, the +third longest, and so on. Dynamic RNN then slices a batch input timestep by +timestep from the sorted input. Once any sequence in the input batch reaches its +end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input +batch size for the next time step. 
+)DOC"); } }; @@ -84,7 +109,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); @@ -96,19 +121,24 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { dx_tensor.Resize(x_tensor.dims()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + if (dout_var == nullptr) { // dx_tensor fill zero math::set_constant(dev_ctx, &dx_tensor, 0.0f); } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; - dx_tensor.Slice(0, static_cast(height)) - .CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx); - if (dx_tensor.dims()[0] < height) { + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); + if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( - static_cast(height), static_cast(dout_tensor.dims()[0])); + static_cast(height), static_cast(dx_tensor.dims()[0])); math::set_constant(dev_ctx, &rest_tensor, 0.0f); } } + dx_tensor.set_lod(x_tensor.lod()); } }; @@ -119,6 +149,7 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase { PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X"))); context->SetOutputDim(framework::GradVarName("X"), context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); } }; @@ -127,14 +158,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); op->SetType("shrink_rnn_memory_grad"); op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index d9e40546523c60b0a7eec2e0593446258996ba58..c526a88a127da12a6384777bca31b60873844d94 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
@@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
     auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Labels");
+    auto labels_dims = ctx->GetInputDim("Label");
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Labels)'s rank should be 2.");
+                      "Input(Label)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "The 2nd dimension of Input(X) and Input(Label) should "
                       "be equal.");
 
     ctx->SetOutputDim("Out", x_dims);
@@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@GRAD) should not be null.");
     auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Labels");
+    auto labels_dims = ctx->GetInputDim("Label");
     auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Labels)'s rank should be 2.");
+                      "Input(Label)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
                       "Input(Out@Grad)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "The 2nd dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
                       "The 1st dimension of Input(X) and Input(Out@Grad) "
@@ -88,8 +86,8 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto,
-                                       framework::OpAttrChecker* op_checker)
+  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
+                                       OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor, default Tensor), a 2-D tensor with shape N x D, " @@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker "This input is a tensor of logits computed by the previous " " operator. Logits are unscaled log probabilities given as " "log(p/(1-p))."); - AddInput("Labels", + AddInput("Label", "(Tensor, default Tensor), a 2-D tensor of the same type " "and shape as X. This input is a tensor of probabalistic labels " "for each logit"); @@ -144,7 +142,7 @@ REGISTER_OP(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu index 32a39956a14a206373b7b4c141dad19577d171f0..3f393265f48b428dca8703ff77688de979fb63df 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -1,24 +1,24 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::GPUPlace, float>); -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::GPUPlace, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 41c619f181c878f08959a8ca461c60af5ffdff2a..b78bcc436e9fa5c5d4db3fbb22224e328c3bc3c2 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/framework/eigen.h" @@ -20,20 +20,19 @@ namespace paddle { namespace operators { // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template +template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = - context.Input("Labels"); + const framework::Tensor *Labels = context.Input("Label"); framework::Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); auto x = framework::EigenVector::Flatten(*X); auto labels = framework::EigenVector::Flatten(*Labels); auto out = framework::EigenVector::Flatten(*Out); - auto place = context.GetEigenDevice(); + auto &place = *context.device_context().eigen_device(); // term1 = max(x, 0) auto term1 = x.cwiseMax(static_cast(0)); @@ -47,13 +46,12 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { }; // dX = sigmoid(X) - labels -template +template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = - context.Input("Labels"); + const framework::Tensor *Labels = context.Input("Label"); const framework::Tensor *dOut = context.Input(framework::GradVarName("Out")); framework::Tensor *dX = @@ -64,7 +62,8 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { auto labels = framework::EigenVector::Flatten(*Labels); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); dx.device(place) = dout * (sigmoid_x - labels); diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index 08bf2e4e7cc101a3bcc907d3b40ee82347b39f80..f63eaa4464cc668acdb8e5b8a74ad5bba936db44 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
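The comment at the top of the sigmoid kernels above gives the numerically stable form of the loss. A scalar sketch of the forward formula and its gradient (illustrative C++, not the Paddle kernels):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // loss = max(x, 0) - x * label + log(1 + exp(-|x|)) is the stable
    // rewrite of -label*log(p) - (1 - label)*log(1 - p), p = sigmoid(x).
    double sigmoid_ce_with_logits(double x, double label) {
      return std::max(x, 0.0) - x * label + std::log1p(std::exp(-std::fabs(x)));
    }

    // dLoss/dx = sigmoid(x) - label, matching the grad kernel above.
    double sigmoid_ce_grad(double x, double label) {
      return 1.0 / (1.0 + std::exp(-x)) - label;
    }

    int main() {
      std::printf("loss=%f grad=%f\n", sigmoid_ce_with_logits(2.0, 1.0),
                  sigmoid_ce_grad(2.0, 1.0));
    }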
*/ #include "paddle/operators/sign_op.h" @@ -34,7 +34,7 @@ class SignOp : public framework::OperatorWithKernel { template class SignOpMaker : public framework::OpProtoAndCheckerMaker { public: - SignOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + SignOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor of sign operator."); AddOutput("Out", "(Tensor) Output tensor of sign operator."); @@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", 0.0f); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -67,5 +67,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker); -REGISTER_OP_CPU_KERNEL(sign, - ops::SignKernel); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu index 4d0638cb97d84bf650fb23e4d2a201adc51a4b68..f224880cffb2154a7c46b8a4701d7357e67bb70c 100644 --- a/paddle/operators/sign_op.cu +++ b/paddle/operators/sign_op.cu @@ -1,18 +1,19 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/sign_op.h" -REGISTER_OP_GPU_KERNEL( - sign, paddle::operators::SignKernel); +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h index ab5cd4bac019d602c63ea51629fb85fa7e206841..9fe49ae1a2161d9f1472eef830c11b0f8305c568 100644 --- a/paddle/operators/sign_op.h +++ b/paddle/operators/sign_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class SignKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -29,7 +29,8 @@ class SignKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); eigen_out.device(place) = eigen_in.sign(); } }; diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index ebf7b43700a7498aa18b5f648b0b8c2c4e7b442b..dcb18d729da69beaa556e4b93129dafb08b72c06 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
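SignGradMaker above lowers the backward pass to a plain scale op with scale = 0.0: sign is piecewise constant, so its derivative is zero wherever it exists. A scalar sketch of the forward kernel and that zero-scale gradient (illustrative C++ only):

    #include <cstdio>

    // sign(x) in {-1, 0, +1}, as computed elementwise by eigen_in.sign().
    double sign(double x) { return (x > 0) - (x < 0); }

    // The gradient op just multiplies the incoming gradient by 0.
    double sign_grad(double dout) { return 0.0 * dout; }

    int main() { std::printf("%g %g\n", sign(-3.5), sign_grad(1.0)); }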
*/ #include "paddle/operators/smooth_l1_loss_op.h" @@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); + PADDLE_ENFORCE_EQ(x_dims, y_dims); PADDLE_ENFORCE_GE(x_dims.size(), 2, - "The tensor rank of X must be at least 2."); + "The tensor rank of Input(X) should not be less than 2."); if (ctx->HasInput("InsideWeight")) { PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), "If weights are provided, must specify both " "inside and outside weights."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims, - "The shape of InsideWeight must be same as X."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims, - "The shape of OutsideWeight must be same as X."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); } ctx->SetOutputDim("Diff", x_dims); @@ -49,29 +47,32 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { template class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { public: - SmoothL1LossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "The input tensor of smooth l1 loss op." - "The rank should be greater or equal to 2 with shape " - "[batch_size, value_dim1, value_dim2, ..., value_dimN]"); + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); AddInput("Y", - "The target tensor of smooth l1 loss op " - "with the same shape as X."); + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with same shape as X."); AddInput("InsideWeight", - "Optional input tensor of smooth l1 loss op with the same shape " - "as X. If provided, the result of (X - Y) will be multiplied " + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the result of (X - Y) will be multiplied " "by this tensor element by element.") .AsDispensable(); AddInput("OutsideWeight", - "Optinal input of smooth l1 loss op with the same shape as X." - "If provided, the output smooth l1 loss will be multiplied by " - "this tensor element by element.") + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the out smooth l1 loss will be multiplied by this " + "tensor element by element.") .AsDispensable(); - AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") .AsIntermediate(); - AddOutput("Out", "Smooth l1 loss."); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank be 2. 
" + "The output smooth l1 loss with shape [batch_size, 1]."); AddAttr("sigma", "Hyper parameter of smooth l1 loss op." "A float scalar with default value 3.0.") @@ -79,15 +80,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Smooth L1 Loss Operator. -This operator computes the smooth l1 loss for input and target. -The operator takes the first dimension of input as the batch size. +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. For each instance, it computes the smooth l1 loss element by element first -and then sums all the losses. So the resulting output shape -is [batch_size, 1]. +and then sums all the losses. So the shape of Out is [batch_size, 1]. The equation is: -loss = $$0.5 * (\sigma * (x-y))^2$$ if $$|x - y| < 1 /({\sigma}^2)$$ - $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i)) ^ 2 +\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\ +\frac{|X_i - Y_i| - 0.5}{{\sigma}^2}, +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. )DOC"); } @@ -128,7 +137,8 @@ REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, ops::SmoothL1LossGradOp); REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); + smooth_l1_loss, + ops::SmoothL1LossKernel); REGISTER_OP_CPU_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu index 1c3172f43867741cd1f26979a366b2425f326321..213429bc370ef0d5b493b3a448df1b3bf0e4e87c 100644 --- a/paddle/operators/smooth_l1_loss_op.cu +++ b/paddle/operators/smooth_l1_loss_op.cu @@ -1,24 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h index 39d0070b6c8909b8f433de48038240e851d9d6cf..3facfae116d711f86ea5c193562c20ea60a2efc9 100644 --- a/paddle/operators/smooth_l1_loss_op.h +++ b/paddle/operators/smooth_l1_loss_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/framework/eigen.h" @@ -44,7 +44,7 @@ struct SmoothL1LossForward { T sigma2; }; -template +template class SmoothL1LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class SmoothL1LossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto sigma = static_cast(context.Attr("sigma")); T sigma2 = sigma * sigma; @@ -67,12 +68,12 @@ class SmoothL1LossKernel : public framework::OpKernel { auto y = EigenVector::Flatten(*in1); auto diff = EigenVector::Flatten(*out0); - diff.device(place) = x - y; + diff.device(*place) = x - y; // multiply inside weight if (has_weight) { auto inside_weight = EigenVector::Flatten(*in2); // cache diff, reused in bp - diff.device(place) = diff * inside_weight; + diff.device(*place) = diff * inside_weight; } auto in_counts = in0->numel(); @@ -81,12 +82,12 @@ class SmoothL1LossKernel : public framework::OpKernel { context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); // apply smooth l1 forward - errors.device(place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); // multiply outside weight if (has_weight) { auto outside_weight = EigenVector::Flatten(*in3); - errors.device(place) = errors * outside_weight; + errors.device(*place) = errors * outside_weight; } auto loss = EigenVector::Flatten(*out1); // first dimension of 'X' is the number of samples @@ -94,7 +95,7 @@ class SmoothL1LossKernel : public framework::OpKernel { framework::make_ddim({static_cast(in0->dims()[0]), static_cast(in_counts / in0->dims()[0])}); auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); - loss.device(place) = errors_mat_view.sum(Eigen::array({{1}})); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); } }; @@ -114,7 +115,7 @@ struct SmoothL1LossBackward { T sigma2; }; -template +template class SmoothL1LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -126,7 +127,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel { T sigma2 = sigma * sigma; bool has_weight = (in0 != nullptr) && (in1 != nullptr); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto in_dims = in2->dims(); auto counts = in2->numel(); @@ -139,7 +141,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); // apply smooth l1 backward - diff.device(place) = EigenVector::Flatten(*in2).unaryExpr( + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( SmoothL1LossBackward(sigma2)); // compute weights @@ -147,11 +149,11 @@ class SmoothL1LossGradKernel : public framework::OpKernel { ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 - weights.device(place) = weights.constant(static_cast(1.0)); + weights.device(*place) = weights.constant(static_cast(1.0)); if (has_weight) { auto inside_weight = EigenMatrix::From(*in0, mat_dims); auto outside_weight = EigenMatrix::From(*in1, mat_dims); - weights.device(place) = inside_weight * outside_weight; + weights.device(*place) = inside_weight * outside_weight; } // compute 
gradients @@ -167,13 +169,13 @@ class SmoothL1LossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenMatrix::From(*out0, mat_dims); - x_grad.device(place) = gradients; + x_grad.device(*place) = gradients; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenMatrix::From(*out1, mat_dims); - y_grad.device(place) = -1 * gradients; + y_grad.device(*place) = -1 * gradients; } } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 93f89e33a73c5f4c6c0e5a8793a0abe7c692b656..cef1f1fc99d6a34db068257f94f935bf4435119d 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -24,25 +24,25 @@ class SoftmaxOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of SoftmaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Y"), - "Output(Y) of SoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SoftmaxOp should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(x_dims.size() == 2UL, "The input of softmax op must be a matrix."); - ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftmaxOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of softmax. " "2-D with shape [batch_size, input_feature_dimensions]."); - AddOutput("Y", "The normalized values with the same shape as X."); + AddOutput("Out", "The normalized values with the same shape as X."); AddComment(R"DOC( Softmax Operator. @@ -59,8 +59,8 @@ Then the ratio of the exponential of the given dimension and the sum of exponential values of all the other dimensions is the output of the softmax operator. 
-For each row `i` and each column `j` in input X, we have: - $$Y[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$ +For each row $i$ and each column $j$ in Input(X), we have: + $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])}$$ )DOC"); } @@ -71,12 +71,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), - "Input(Y@GRAD) should be not null."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Y"), - ctx->GetInputDim(framework::GradVarName("Y")), - "Input(Y) and its gradients should have a same shape."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and its gradients should have the same shape."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } @@ -89,7 +89,8 @@ namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL(softmax, - ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc index 013ace19ae3d4a1af29b570ba33fea3e4595fe5b..e7da40f3e82d5db858a795e9634abf57b884d6a2 100644 --- a/paddle/operators/softmax_op.cu.cc +++ b/paddle/operators/softmax_op.cu.cc @@ -1,22 +1,23 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
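A standalone sketch of the DOC formula Out[i, j] = exp(X[i, j]) / sum_j exp(X[i, j]) for one row, with the usual max shift for numerical stability (illustrative C++; in the operator the real work happens inside math::SoftmaxFunctor):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<double> softmax_row(std::vector<double> x) {
      const double m = *std::max_element(x.begin(), x.end());  // shift for stability
      double sum = 0.0;
      for (double& v : x) { v = std::exp(v - m); sum += v; }
      for (double& v : x) v /= sum;  // normalize so the row sums to 1
      return x;
    }

    int main() {
      for (double p : softmax_row({1.0, 2.0, 3.0})) std::printf("%f ", p);
      std::printf("\n");
    }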
*/ #include "paddle/operators/softmax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax, - ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 44d1e63f1bb4798144218cd1caf01f133825bcff..63e379a3b31a6c75aab0c56b4ce1b988fa7f0318 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -21,32 +21,34 @@ namespace operators { using Tensor = framework::Tensor; -template +template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* X = context.Input("X"); - auto* Y = context.Output("Y"); + auto* Out = context.Output("Out"); // allocate memory on device. - Y->mutable_data(context.GetPlace()); + Out->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), X, Y); + math::SoftmaxFunctor()( + context.template device_context(), X, Out); } }; -template +template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* Y = context.Input("Y"); - auto* dY = context.Input(framework::GradVarName("Y")); + auto* Out = context.Input("Out"); + auto* dOut = context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); // allocate memory on device. dX->mutable_data(context.GetPlace()); - math::SoftmaxGradFunctor()(context.device_context(), Y, dY, dX); + math::SoftmaxGradFunctor()( + context.template device_context(), Out, dOut, dX); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index 3dbb62d2e571eb92025c1b3fc0a6653c7cda007a..7135780c92dca00503ab098dc6930afd8fac0be8 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -1,10 +1,10 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/softmax_with_cross_entropy_op.h" -#include namespace paddle { namespace operators { @@ -21,8 +20,7 @@ namespace operators { class SoftmaxWithCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Logits", "(Tensor, default: Tensor), The unscaled log probabilities " @@ -67,15 +65,15 @@ The equation is as follows: 1) Hard label (one-hot label, so every sample has exactly one class) -$$Loss_j = \f$ -\text{Logit}_{Label_j} + +$$Loss_j = -\text{Logit}_{Label_j} + \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), -j = 1, ..., K $\f$$ +j = 1,..., K$$ 2) Soft label (each sample can have a distribution over all classes) -$$Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i - +$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i - \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), -j = 1,...,K $\f$$ +j = 1,...,K$$ )DOC"); } @@ -120,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Logits")->type()), @@ -161,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( @@ -175,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Softmax", Output("Softmax")); @@ -185,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index b1faddac3fd21aaf817caf9d3e57e664f4e0e2d5..61583c6161c3bbc62788dc8b6940ddcc29b2302a 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -1,10 +1,10 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -69,10 +69,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, context.Attr("soft_label")); } }; @@ -98,18 +98,18 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { if (context.Attr("soft_label")) { const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } else { const int64_t* label_data = labels->data(); - CrossEntropyGrad<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } } }; @@ -118,9 +118,9 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index c4ab3f74b4b07d13957d99e01aa4868fac719f61..6bde0f37e06ccf7d81487e0c99227287787c5d72 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -1,10 +1,10 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -40,11 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, - context.Attr("soft_label")); + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); } }; @@ -62,14 +63,15 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; auto out_grad_mat = EigenMatrix::From(*out_grad); auto logit_grad_mat = EigenMatrix::From(*logit_grad); - + auto& place = *context.template device_context() + .eigen_device(); if (context.Attr("soft_label")) { auto lbl_mat = EigenMatrix::From(*labels); - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * (logit_grad_mat - lbl_mat); } else { - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index db635f2ba0804143c9a2e04ff006dfbc8744f3fc..bd93c492015e074afe08ee167025aa6251b369d1 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
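For hard labels, the backward pass of softmax_with_cross_entropy above reduces to the classic identity d loss / d logit_j = softmax_j - [j == label], scaled by the incoming loss gradient. A sketch (illustrative C++; probs is assumed to already hold softmax(logits)):

    #include <cstdio>
    #include <vector>

    std::vector<double> ce_grad(const std::vector<double>& probs, int label,
                                double loss_grad) {
      std::vector<double> g(probs);
      g[label] -= 1.0;                    // subtract the one-hot target
      for (double& v : g) v *= loss_grad; // chain rule with dLoss
      return g;
    }

    int main() {
      for (double v : ce_grad({0.2, 0.7, 0.1}, 1, 1.0)) std::printf("%f ", v);
      std::printf("\n");
    }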
*/ #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); auto *out_true = @@ -44,12 +45,15 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef PADDLE_WITH_CUDA - cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx); + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -105,10 +109,11 @@ class SplitLoDTensorOp : public framework::OperatorBase { continue; } // out[offset: offset+len] = x[each_range.begin: each_range.end] - out->Slice(static_cast(offset), static_cast(offset + len)) - .CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx); + auto slice = out->Slice(static_cast(offset), + static_cast(offset + len)); + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } @@ -117,8 +122,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - SplitLoDTensorOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input LoDTensor"); AddInput("Mask", "A bool column vector which mask the input"); @@ -163,8 +167,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("merge_lod_tensor"); grad_op->SetInput("InTrue", OutputGrad("OutTrue")); grad_op->SetInput("InFalse", OutputGrad("OutFalse")); @@ -172,7 +176,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput("X", Input("X")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc index 275b25e96aa75fdbcb7275e272c49ea8d278d2c8..8d55ae5dd7b0e76acb9f21cb10b79cb7aca18a8d 100644 --- a/paddle/operators/split_op.cc +++ b/paddle/operators/split_op.cc @@ -60,12 +60,18 @@ class SplitOp : public framework::OperatorWithKernel { } } ctx->SetOutputsDim("Out", outs_dims); + if (axis != 0) { + // Only pass LoD when not spliting along the first dim. 
+ for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + } } }; class SplitOpMaker : public framework::OpProtoAndCheckerMaker { public: - SplitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor of the split operator."); AddOutput("Out", "(Tensor) Output tensors of the split operator.") @@ -108,13 +114,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto op = new framework::OpDesc(); op->SetType("concat"); op->SetInput("X", OutputGrad("Out")); op->SetOutput("Out", InputGrad("X")); op->SetAttrMap(Attrs()); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc index 93d1fc3c44cbc146c945c51af1abe6494572d1ae..dbad0bbf68d7924cfba80721bb3294b7e0cfac00 100644 --- a/paddle/operators/split_op.cu.cc +++ b/paddle/operators/split_op.cu.cc @@ -14,5 +14,5 @@ limitations under the License. */ #include "paddle/operators/split_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(split, - ops::SplitOpKernel); +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h index fa26e5f677b18c84b45dd583004d02cab4c1d375..a38c435d531d7da2a1a60eb2c455bc1782c1cd4c 100644 --- a/paddle/operators/split_op.h +++ b/paddle/operators/split_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SplitOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0aa87b0f06ca9c7d156dfdf8df188da68ac1450 --- /dev/null +++ b/paddle/operators/spp_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/spp_op.h" +namespace paddle { +namespace operators { + +class SppOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SppOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of spp operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W are the height and width of the feature."); + AddOutput("Out", + "(Tensor) The output tensor of spp operator." + "N * M."
+ "M = C * H * W"); + AddAttr("pyramid_height", "(int), multi level pooling"); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddComment(R"DOC( + "With spatial pyramid pooling, the input image can + be of any sizes. This not only allows arbitrary aspect + ratios, but also allows arbitrary scales. We can resize + the input image to any scale (e.g., min(w, h)=180, 224, + ...) and apply the same deep network. When the + input image is at different scales, the network (with + the same filter sizes) will extract features at different + scales. The scales play important roles in traditional + methods. + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(H_{out}, W_{out})$ + Where + $$ + H_{out} = N \\ + W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} + $$ + paper https://arxiv.org/pdf/1406.4729v4.pdf + )DOC"); + } +}; + +class SppOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SppOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SppOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int pyramid_height = ctx->Attrs().Get("pyramid_height"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Spping intput must be of 4-dimensional."); + int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; + std::vector output_shape({in_x_dims[0], outlen}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class SppOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad); +REGISTER_OP_CPU_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CPU_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/operators/spp_op.cu.cc b/paddle/operators/spp_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..761e4d6c4a9639898ba548d56bed3c8817720c1b --- /dev/null +++ b/paddle/operators/spp_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/spp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/operators/spp_op.h b/paddle/operators/spp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f35b305d02c73bcae6e72b8afa5ce55148ea98b8 --- /dev/null +++ b/paddle/operators/spp_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/pooling.h" +#include "paddle/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { +template +class SppKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + auto* out = context.Output("Out"); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + out->mutable_data(context.GetPlace()); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t output_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // pooling output shape + framework::Tensor out_level; + std::vector output_shape_vec( + {in_x->dims()[0], in_x->dims()[1], bins, bins}); + framework::DDim output_shape(framework::make_ddim(output_shape_vec)); + out_level.mutable_data(output_shape, context.GetPlace()); + // pooling + if (pooling_type == "max") { + math::Pool2dFunctor, T> pool_forward; + math::MaxPool max_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, max_process, &out_level); + } else if (pooling_type == "avg") { + math::Pool2dFunctor, T> pool_forward; + math::AvgPool avg_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, avg_process, &out_level); + } + // flatten pooling output shape + int output_flatten_w = in_x->dims()[1] * bins * bins; + std::vector output_flatten_shape_vec( + {in_x->dims()[0], output_flatten_w}); + framework::DDim output_flatten_shape( + framework::make_ddim(output_flatten_shape_vec)); + out_level.Resize(output_flatten_shape); + // concat + auto out_level_stride = framework::stride(out_level.dims()); + StridedMemcpy(context.template device_context(), + out_level.data(), 
out_level_stride, out_level.dims(), + out_stride, out->data() + output_offset); + output_offset += out_level.dims()[1] * out_level_stride[1]; + } + } +}; +template +class SppGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t out_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // split out and outgrad ... to flatten + framework::Tensor out_level; + framework::Tensor outgrad_level; + int out_flatten_w = in_x->dims()[1] * bins * bins; + std::vector out_flatten_shape_vec( + {in_x->dims()[0], out_flatten_w}); + framework::DDim out_flatten_shape( + framework::make_ddim(out_flatten_shape_vec)); + out_level.mutable_data(out_flatten_shape, context.GetPlace()); + outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); + auto flatten_stride = framework::stride(out_level.dims()); + // memcpy + StridedMemcpy(context.template device_context(), + out->data() + out_offset, out_stride, + out_level.dims(), flatten_stride, out_level.data()); + + StridedMemcpy(context.template device_context(), + out_grad->data() + out_offset, out_stride, + outgrad_level.dims(), flatten_stride, + outgrad_level.data()); + out_offset += out_level.dims()[1] * out_stride[1]; + // flatten backward to nchw + + std::vector out_shape_vec({in_x->dims()[0], in_x->dims()[1]}); + out_shape_vec.push_back( + (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); + out_shape_vec.push_back( + (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out_level.ShareDataWith(out_level); + out_level.Resize(out_shape); + outgrad_level.ShareDataWith(outgrad_level); + outgrad_level.Resize(out_shape); + // pooling backward + if (pooling_type == "max") { + math::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + math::Pool2dGradFunctor, T> + pool_backward; + math::AvgPoolGrad avg_process; + pool_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, avg_process, in_x_grad); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/squared_l2_distance_op.cc 
b/paddle/operators/squared_l2_distance_op.cc index bec2a2c18ae8da892ee7d71f45afe53c887c0f57..9e097176f3434e81e31f2ecf4093af47b654e816 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquaredL2DistanceOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); @@ -115,7 +114,7 @@ REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, ops::SquaredL2DistanceGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu index 3fe62f1a9cb56722ea544b0fed052ac384e799aa..f2648dde5eb9c56aed3fad81521e6207dc2d973e 100644 --- a/paddle/operators/squared_l2_distance_op.cu +++ b/paddle/operators/squared_l2_distance_op.cu @@ -1,25 +1,25 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_GPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h index 259ef4029646914f83a112b9c6d7fdf8401483f6..5bd5f4819a35966b73038f433d38c06031e18715 100644 --- a/paddle/operators/squared_l2_distance_op.h +++ b/paddle/operators/squared_l2_distance_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SquaredL2DistanceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class SquaredL2DistanceKernel : public framework::OpKernel { auto sub_result = EigenMatrix::From(*out0); auto z = EigenVector::Flatten(*out1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x_dims = x.dimensions(); auto y_dims = y.dimensions(); // buffer the substraction result @@ -67,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel { } }; -template +template class SquaredL2DistanceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,7 +90,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { sub_result; // propagate back to input - auto eigen_place = context.GetEigenDevice(); + auto& eigen_place = + *context.template device_context().eigen_device(); if (x_g) { x_g->mutable_data(context.GetPlace()); // eigen matrix diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc index 3c10e6159f44bc8c21b1e79aefaa962c7a2b64ed..6626bf0375548eac457f960105ce33e63e1a3706 100644 --- a/paddle/operators/squared_l2_norm_op.cc +++ b/paddle/operators/squared_l2_norm_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/squared_l2_norm_op.h" @@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel { class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquaredL2NormOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of squared_l2_norm op."); AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); @@ -72,7 +71,7 @@ REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); + ops::SquaredL2NormKernel); REGISTER_OP_CPU_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu index d384e9c28c9150fa901404478739ff809f29126f..b222113a8c82061bd841da440a714b66f6c1fb9c 100644 --- a/paddle/operators/squared_l2_norm_op.cu +++ b/paddle/operators/squared_l2_norm_op.cu @@ -1,24 +1,24 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); -REGISTER_OP_GPU_KERNEL( + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h index 48d7b1c2d56882f04330dbf27b0a92e37cb8874c..1ce26c775ed5700cee73d00f4c51d58a692a1152 100644 --- a/paddle/operators/squared_l2_norm_op.h +++ b/paddle/operators/squared_l2_norm_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(square(X)) -template +template class SquaredL2NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class SquaredL2NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); - out.device(place) = x.square().sum(); + out.device(*place) = x.square().sum(); } }; // dX = X -template +template class SquaredL2NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -53,10 +54,11 @@ class SquaredL2NormGradKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); Eigen::DSizes x_dsize(X->numel()); - dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); } }; diff --git a/paddle/operators/strided_memcpy.h b/paddle/operators/strided_memcpy.h index c9dd80518424017d9834a2bf7aee14caa56c9d79..735cabcd973a3b5ea1ab8ab57091eae14e23b89b 100644 --- a/paddle/operators/strided_memcpy.h +++ b/paddle/operators/strided_memcpy.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include "paddle/operators/detail/strided_memcpy.h" diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc index 68f064eaee5851333ddf9767b7138da83a28503d..06d81188558aad85c41c56ddefad3617d48da74c 100644 --- a/paddle/operators/strided_memcpy_test.cc +++ b/paddle/operators/strided_memcpy_test.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/strided_memcpy.h" #include "gtest/gtest.h" @@ -82,11 +82,13 @@ TEST(StridedMemcpy, GPUCrop) { }; // clang-format on - platform::GPUPlace gpu0(0); + platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); - memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); framework::DDim src_stride({5, 1}); @@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) { framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({2, 1}); - platform::CUDADeviceContext ctx(gpu0); StridedMemcpy(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); @@ -120,11 +121,12 @@ TEST(StridedMemcpy, GPUConcat) { }; // clang-format on - platform::GPUPlace gpu0(0); + platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); - memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); int dst[8]; int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); @@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) { framework::DDim src_stride({2, 1}); framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({4, 1}); - platform::CUDADeviceContext ctx(gpu0); StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index c2b7632b2865a3ef66051d815d7722a08c6a8cbd..88ed67f7ba2527e1d91e6bb30762d5dcf818761d 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -29,7 +29,7 @@ class SumOp : public framework::OperatorWithKernel { "Output(Out) of SumOp should not be null."); if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] == - framework::VarDesc::LOD_TENSOR_ARRAY) { + framework::proto::VarDesc::LOD_TENSOR_ARRAY) { return; // skip 
runtime infershape when is tensor array; } @@ -37,23 +37,43 @@ class SumOp : public framework::OperatorWithKernel { size_t N = x_dims.size(); PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); - auto in_dim = x_dims[0]; - for (size_t i = 1; i < N; i++) { - auto dim = x_dims[i]; - PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape"); + framework::DDim in_dim({0}); + for (auto& x_dim : x_dims) { + if (framework::product(x_dim) == 0) { + continue; + } + if (framework::product(in_dim) == 0) { + in_dim = x_dim; + } else { + PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape"); + } } ctx->SetOutputDim("Out", in_dim); ctx->ShareLoD("X", /*->*/ "Out"); } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); if (x_vars[0]->IsType()) { + int dtype = -1; + for (auto& x_var : x_vars) { + auto& lod_tensor = x_var->Get<framework::LoDTensor>(); + if (lod_tensor.numel() == 0) { + continue; + } + if (dtype == -1) { + dtype = framework::ToDataType(lod_tensor.type()); + } else { + PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); + } + } + PADDLE_ENFORCE_NE(dtype, -1, + "Sum operator should have at least one tensor"); + return framework::OpKernelType( - framework::ToDataType(x_vars[0]->Get().type()), - ctx.device_context()); + static_cast<framework::proto::DataType>(dtype), ctx.device_context()); } else if (x_vars[0]->IsType()) { return framework::OpKernelType( framework::ToDataType( @@ -78,7 +98,7 @@ class SumOp : public framework::OperatorWithKernel { class SumOpMaker : public framework::OpProtoAndCheckerMaker { public: - SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + SumOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); @@ -86,8 +106,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sum operator. -This operators sums the input tensors. All the inputs can carry the -LoD (Level of Details) information. However, the output only shares +This operator sums the input tensors. All the inputs can carry the +LoD (Level of Details) information. However, the output only shares the LoD information with the first input. )DOC"); }
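A minimal standalone sketch (hypothetical helper; plain ints stand in for framework data types) of the kernel-dtype rule the new GetExpectedKernelType above encodes: empty inputs are skipped, the first non-empty input fixes the dtype, and every later non-empty input must agree with it:

#include <cassert>
#include <vector>

struct FakeInput {
  int dtype;   // stand-in for framework::ToDataType(tensor.type())
  bool empty;  // stand-in for tensor.numel() == 0
};

// Returns the common dtype of the non-empty inputs, or -1 if all are empty
// (a case SumOp rejects with PADDLE_ENFORCE_NE).
int PickSumDtype(const std::vector<FakeInput>& inputs) {
  int dtype = -1;
  for (const auto& in : inputs) {
    if (in.empty) continue;
    if (dtype == -1) {
      dtype = in.dtype;
    } else {
      assert(dtype == in.dtype && "sum inputs must share one dtype");
    }
  }
  return dtype;
}

@@ -95,25 +115,25 @@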
class SumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { auto& inputs = op_desc.Input("X"); - auto var_type = framework::VarDesc::SELECTED_ROWS; + auto var_type = framework::proto::VarDesc::SELECTED_ROWS; for (auto& name : op_desc.Input("X")) { VLOG(10) << name << " " - << block->FindRecursiveOrCreateVar(name)->GetType(); + << block->FindRecursiveOrCreateVar(name).GetType(); } bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name)->GetType() == - framework::VarDesc::LOD_TENSOR; + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR; }); auto is_tensor_array = [block](const std::string& name) { - return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() == - framework::VarDesc::LOD_TENSOR_ARRAY; + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR_ARRAY; }; bool any_input_is_tensor_array = @@ -126,19 +146,18 @@ class SumOpVarTypeInference : public framework::VarTypeInference { std::ostringstream os; for (auto& each : inputs) { os << " " << each << " type is " - << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType() - << "\n"; + << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; } PADDLE_ENFORCE(all_inputs_are_tensor_array, "Not all inputs are tensor array:\n%s", os.str()); } - var_type = framework::VarDesc::LOD_TENSOR_ARRAY; + var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY; } else if (any_input_is_lod_tensor) { - var_type = framework::VarDesc::LOD_TENSOR; + var_type = framework::proto::VarDesc::LOD_TENSOR; } auto out_var_name = op_desc.Output("Out").front(); - auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name)); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); out_var.SetType(var_type); auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); out_var.SetDataType(in_var.GetDataType()); @@ -149,20 +168,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase { public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() - const override { - auto x_grads = InputGrad("X"); - std::vector> grad_ops; + std::vector> operator()() const override { + auto x_grads = InputGrad("X", false); + std::vector> grad_ops; grad_ops.reserve(x_grads.size()); auto og = OutputGrad("Out"); std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), [&og](const std::string& x_grad) { - auto* grad_op = new framework::OpDescBind(); + auto* grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", og); grad_op->SetOutput("Out", {x_grad}); grad_op->SetAttr("scale", 1.0f); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); }); return grad_ops; } @@ -175,5 +193,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu index 5cf05b876b6d6a2ce61d9e10b7ec52ed3cef57d7..873155076c179d5280a418e25fd39fdaf4b0a2b2 100644 
--- a/paddle/operators/sum_op.cu +++ b/paddle/operators/sum_op.cu @@ -13,5 +13,8 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index 4ca15611392b3117aa6c92cba95911eb8bebeb15..48201b344de0d3bd2b121a12389876dad095f10d 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -37,27 +37,32 @@ class SumKernel : public framework::OpKernel { bool in_place = out_var == in_vars[0]; if (out_var->IsType()) { - auto *out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - + auto *out = context.Output("Out"); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } auto result = EigenVector::Flatten(*out); - if (!in_place) { - math::SetConstant constant_functor; - constant_functor(context.device_context(), out, 0.0); + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); } - math::SelectedRowsAddToTensor functor; - auto place = context.GetEigenDevice(); + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); // If in_place, just skip the first tensor for (int i = in_place ? 1 : 0; i < N; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } auto in = EigenVector::Flatten(in_t); result.device(place) = result + in; } else if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); - functor(context.device_context(), in_t, out); + functor(context.template device_context(), in_t, out); } else { PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); } @@ -65,6 +70,7 @@ class SumKernel : public framework::OpKernel { } else if (out_var->IsType()) { PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now"); auto *out = context.Output("Out"); + out->mutable_rows()->clear(); auto *out_value = out->mutable_value(); // Runtime InferShape @@ -79,14 +85,14 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->mutable_data(context.GetPlace()); - math::SelectedRowsAddTo functor; + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), - in_vars[i]->Get().height()) - functor(context.device_context(), in_vars[i]->Get(), - offset, out); + in_vars[i]->Get().height()); + functor(context.template device_context(), + in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); } } else if (out_var->IsType()) { @@ -102,14 +108,15 @@ class SumKernel : public framework::OpKernel { out_array.resize(i + 1); } if (out_array[i].numel() == 0) { - out_array[i].CopyFrom(in_array[i], in_array[i].place(), - context.device_context()); + framework::Copy(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); - result.device(context.GetEigenDevice()) = result + 
in; + result.device(*context.template device_context() + .eigen_device()) = result + in; } } } diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index ae1b48d7a8e3d573a5134a822a2ed5ef70511077..a70be8b8752d12433bb19b9953d80e397858834c 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/array_operator.h" #include "paddle/operators/detail/safe_ref.h" namespace paddle { @@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); - PADDLE_ENFORCE(x != nullptr, "X must be set"); + if (x == nullptr) return; auto &x_tensor = x->Get(); - size_t offset = GetOffset(scope, dev_ctx); + size_t offset = GetOffset(scope, place); auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { @@ -37,16 +37,26 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } - auto *out_tensor = &out->at(offset); - out_tensor->CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx); - out_tensor->set_lod(x_tensor.lod()); + if (x_tensor.memory_size() > 0) { + auto *out_tensor = &out->at(offset); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + Copy(x_tensor, place, dev_ctx, out_tensor); + out_tensor->set_lod(x_tensor.lod()); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } } }; class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - WriteToArrayOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) the tensor will be written to tensor array"); AddInput( @@ -54,12 +64,16 @@ class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the subscript index in tensor array. 
The number of element " "should be 1"); AddOutput("Out", "(TensorArray) the tensor array will be written"); - AddComment(R"DOC(Write a LoDTensor to a LoDTensor array. + AddComment(R"DOC( +WriteToArray Operator. + +This operator writes a LoDTensor to a LoDTensor array. -Assume T is LoDTensor, i is the subscript of the array, and A is the array. The +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The equation is -A[i] = T +$$A[i] = T$$ + )DOC"); } }; @@ -70,7 +84,9 @@ class WriteToArrayInferShape : public framework::InferShapeBase { PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, "The number of element of subscript index must be 1"); - PADDLE_ENFORCE(context->HasInput("X"), NotHasXError()); + if (!context->HasInput("X")) { + return; + } PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); context->SetOutputDim("Out", context->GetInputDim("X")); } @@ -85,17 +101,17 @@ class WriteToArrayInferShape : public framework::InferShapeBase { class WriteToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto x_name = op_desc.Input("X")[0]; auto out_name = op_desc.Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; - auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), - "Cannot found %s", out_name); - out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY); - auto &x = - detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name); - out.SetDataType(x.GetDataType()); + auto &out = block->FindRecursiveOrCreateVar(out_name); + out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + auto *x = block->FindVarRecursive(x_name); + if (x != nullptr) { + out.SetDataType(x->GetDataType()); + } } }; @@ -107,36 +123,45 @@ class ReadFromArrayOp : public ArrayOp { const framework::AttributeMap &attrs) : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); PADDLE_ENFORCE(x != nullptr, "X must be set"); auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out != nullptr, "Out must be set"); - auto *out_tensor = out->GetMutable(); - size_t offset = GetOffset(scope, dev_ctx); - PADDLE_ENFORCE_LT(offset, x_array.size()); - out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); - out_tensor->set_lod(x_array[offset].lod()); + size_t offset = GetOffset(scope, place); + if (offset < x_array.size()) { + auto *out_tensor = out->GetMutable(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_array[offset], place, dev_ctx, out_tensor); + out_tensor->set_lod(x_array[offset].lod()); + } else { + VLOG(10) << "offset " << offset << " >= " << x_array.size(); + } } }; class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ReadFromArrayProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(TensorArray) the array will be read from."); AddInput("I", 
"(Tensor) the subscript index in tensor array. The number of " "element should be 1"); AddOutput("Out", "(LoDTensor) the tensor will be read from."); - AddComment(R"DOC(Read a LoDTensor from a LoDTensor Array + AddComment(R"DOC( +ReadFromArray Operator. -Assume T is LoDTensor, i is th e subscript of the array, and A is the array. The +Read a LoDTensor from a LoDTensor Array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The equation is -T = A[i] +$$T = A[i]$$ + )DOC"); } }; @@ -156,14 +181,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("read_from_array"); grad_op->SetInput("I", Input("I")); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -172,14 +197,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("write_to_array"); grad_op->SetInput("I", Input("I")); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc index 16ae925eb5cab1c05f3bc376972cabadc4367d20..a8ddd729732eb41618ea2788a6eb70a16a5ac70f 100644 --- a/paddle/operators/top_k_op.cc +++ b/paddle/operators/top_k_op.cc @@ -41,12 +41,14 @@ class TopkOp : public framework::OperatorWithKernel { dims[dims.size() - 1] = k; ctx->SetOutputDim("Out", dims); ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); } }; class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: - TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of Topk op"); AddOutput("Out", "(Tensor) The output tensor of Topk op"); diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 7851c71bbe9fe73402968ce14f6db0df523cd6d3..f7bf58e7218cb9b94526cb9346bc5f9aa971038a 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/platform/assert.h" @@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); @@ -317,4 +317,4 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index bc8563717a21bd5b3d8fc87f689657990066957b..e9cd9bbd4d964c28f305fb4ab4c4733ed27ebfff 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 94de3d5069017a7ca818e246ad574c4db92d8006..11615d806a61b3525d2ed50f5ea5940e8d61c8f8 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/transpose_op.h" @@ -55,8 +55,7 @@ class TransposeOp : public framework::OperatorWithKernel { class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: - TransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -71,18 +70,31 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { Transpose Operator. The input tensor will be permuted according to the axis values given. -The op functions similar to how numpy.transpose works in python. +The op is similar to how numpy.transpose works in python. + For example: - >> input = numpy.arange(6).reshape((2,3)) - >> input - array([[0, 1, 2], - [3, 4, 5]]) - >> axis = [1, 0] - >> output = input.transpose(axis) - >> output - array([[0, 3], - [1, 4], - [2, 5]]) + + .. code-block:: text + + input = numpy.arange(6).reshape((2,3)) + + the input is: + + array([[0, 1, 2], + [3, 4, 5]]) + + given axis is: + + [1, 0] + + output = input.transpose(axis) + + then the output is: + + array([[0, 3], + [1, 4], + [2, 5]]) + So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1}, the output tensor shape will be (N, H, W, C) @@ -112,8 +124,8 @@ class TransposeOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, ops::TransposeOpGrad); -REGISTER_OP_CPU_KERNEL(transpose, - ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc index af3f581462c919bbd2dd1067e536cc638f9c267d..281c4468cc267c659befd238c9a286dd23eaf16d 100644 --- a/paddle/operators/transpose_op.cu.cc +++ b/paddle/operators/transpose_op.cu.cc @@ -1,22 +1,23 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ #include "paddle/operators/transpose_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(transpose, - ops::TransposeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h index e296032f4147f9f8338148f9e4fef100c7cf816f..b9686a2db3f76dbb9b8ebdba5e243f5e5a3c571a 100644 --- a/paddle/operators/transpose_op.h +++ b/paddle/operators/transpose_op.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -20,33 +20,33 @@ namespace paddle { namespace operators { -template -inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + math::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - math::Transpose trans2; + math::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + math::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + math::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + math::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + math::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: @@ -54,7 +54,7 @@ inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, } } -template +template class TransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,12 +64,12 @@ class TransposeKernel : public framework::OpKernel { std::vector axis = context.Attr>("axis"); int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *x, out, axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); } }; -template +template class TransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -88,8 +88,9 @@ class TransposeGradKernel : public framework::OpKernel { } int ndims = axis.size(); - auto& dev_ctx = 
context.device_context(); - TransCompute(ndims, dev_ctx, *out_grad, x_grad, reversed_axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 7975efc7cf134aaf591385a6866254a9c5f2a0bb..3a314bdb9b058f5dfc61b35795000298c40551e6 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -#include -#include +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -63,18 +63,17 @@ class UniformRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - static_cast(ctx.Attr("data_type")), - ctx.device_context()); + static_cast(ctx.Attr("dtype")), + ctx.GetPlace()); } }; class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: - UniformRandomOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "(Tensor) The output tensor of uniform random op"); AddComment(R"DOC( @@ -99,8 +98,8 @@ uniform distribution. "Random seed used for generating samples. " "0 means use a seed generated by the system.") .SetDefault(0); - AddAttr("data_type", "(int, default 5(FP32)) Output tensor data type") - .SetDefault(framework::DataType::FP32); + AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + .SetDefault(framework::proto::DataType::FP32); } }; } // namespace operators diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 8b20bb8287807aca673817c503fee6db04b55753..719d0872a7cba55dc97c95a73a7f86614ab3f4b7 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -#include -#include +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include #include "paddle/framework/op_registry.h" @@ -63,6 +63,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel<float>, + paddle::operators::GPUUniformRandomKernel<double>); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..50cee11a7a2d6a2483c6851e888b61475cba1376 --- /dev/null +++ b/paddle/operators/unpool_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/unpool_op.h" +namespace paddle { +namespace operators { + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, and H and W are the height and width of the " + "feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, and H and W are the height and width of the " + "feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator. " + "The format of output tensor is also NCHW, "
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr>( + "ksize", + "(vector), the unpooling window size(height, width) " + "of unpooling operator."); + AddAttr>("strides", + "(vector, default:{1, 1}), " + "strides (height, width) of unpooling operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector defalut:{0,0}), " + "paddings (height, width) of unpooling operator.") + .SetDefault({0, 0}); + AddAttr( + "unpooling_type", + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); + AddComment(R"DOC( +Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, H_{out}, W_{out})$, where +$$ +H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +$$ +Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf +)DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size - 1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput must be of 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + 
+ +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size - 1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get<std::string>("unpooling_type"); + std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize"); + std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); + std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling input must be 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>, + ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, float>, + ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b002e35c434db561114dbbafce2f3f934daaf6a --- /dev/null +++ b/paddle/operators/unpool_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel<paddle::platform::CUDADeviceContext, float>, + ops::UnpoolKernel<paddle::platform::CUDADeviceContext, double>); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, float>, + ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ee18b118c957c7933890000bbe934e6ffdc9e56f --- /dev/null +++ b/paddle/operators/unpool_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee18b118c957c7933890000bbe934e6ffdc9e56f
--- /dev/null
+++ b/paddle/operators/unpool_op.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y =
+        context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (output_data) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(dev_ctx, out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<DeviceContext, T> unpool2d_max_forward;
+    unpool2d_max_forward(dev_ctx, *in_x, *in_y, out);
+  }
+};
+template <typename DeviceContext, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y =
+        context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0));
+    }
+    math::Unpool2dMaxGradFunctor<DeviceContext, T> unpool2d_max_backward;
+    unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad,
+                          in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
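Both kernels above delegate the actual data movement to math::Unpool2dMaxFunctor and math::Unpool2dMaxGradFunctor. As a rough mental model only (an editorial sketch, assuming Indices stores each maximum's flattened offset within its H*W feature map, as produced by max_pool2d_with_index), the forward pass amounts to a scatter:

#include <cstddef>
#include <vector>

// Hypothetical single-channel illustration: every pooled value is written
// back to the output position recorded for it; all other outputs stay zero.
std::vector<float> Unpool2dMaxSketch(const std::vector<float>& pooled,
                                     const std::vector<int>& indices,
                                     int out_height, int out_width) {
  std::vector<float> out(out_height * out_width, 0.0f);
  for (std::size_t i = 0; i < pooled.size(); ++i) {
    out[indices[i]] = pooled[i];  // indices[i] is a flattened h * W + w offset
  }
  return out;
}

The gradient kernel gathers along the same indices in reverse, which is why the op needs no state beyond Indices.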
diff --git a/paddle/operators/warpctc_op.cc b/paddle/operators/warpctc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd0c5f9957663fb05670269aa6c39a976de7bc9c
--- /dev/null
+++ b/paddle/operators/warpctc_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/warpctc_op.h"
+
+namespace paddle {
+namespace operators {
+
+class WarpCTCOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) of WarpCTCOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of WarpCTCOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"),
+                   "Output(WarpCTCGrad) of WarpCTCOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
+                   "Output(Loss) of WarpCTCOp should not be null.");
+
+    auto logits_dims = ctx->GetInputDim("Logits");
+    int sequence_width =
+        static_cast<int>(framework::product(logits_dims) / logits_dims[0]);
+    int blank = ctx->Attrs().Get<int>("blank");
+    PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width),
+                   "The value of Attr(blank) should be in interval [0, %d).",
+                   sequence_width);
+    // TODO(liuyiqun): it is tricky to set the wrong dimension here.
+    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
+  }
+};
+
+class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "(LoDTensor, default: LoDTensor<float>), the unscaled "
+             "probabilities of variable-length sequences, which is a 2-D "
+             "Tensor with LoD information. Its shape is "
+             "[Lp, num_classes + 1], where Lp is the sum of all input "
+             "sequences' lengths and num_classes is the true number of classes "
+             "(not including the blank label).");
+    AddInput("Label",
+             "(LoDTensor, default: LoDTensor<int>), the ground truth "
+             "of variable-length sequences, which is a 2-D Tensor with LoD "
+             "information. It is of the shape [Lg, 1], where Lg is the sum of "
+             "all labels' lengths.");
+    AddOutput("WarpCTCGrad",
+              "(Tensor, default: Tensor<float>), a temporary "
+              "output Tensor to store the gradients of warp-ctc, which is "
+              "computed with loss together in one call. It is a 3-D Tensor of "
+              "the shape [max_sequence_length, batch_size, num_classes + 1].")
+        .AsIntermediate();
+    AddOutput("Loss",
+              "(Tensor, default: Tensor<float>), the Connectionist "
+              "Temporal Classification (CTC) loss, which is a 2-D Tensor of "
+              "the shape [batch_size, 1].");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label of Connectionist "
+                 "Temporal Classification (CTC) loss, which is in the "
+                 "half-open interval [0, num_classes + 1).")
+        .SetDefault(0);
+    AddAttr<bool>("norm_by_times",
+                  "(bool, default: false), whether to "
+                  "normalize the gradients by the number of time-steps, "
+                  "which is also the sequence's length.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+An operator integrating the open-source
+[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
+[Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](
+https://arxiv.org/pdf/1512.02595v1.pdf),
+to compute Connectionist Temporal Classification (CTC) loss.
+It can be aliased as softmax with CTC, since a native softmax activation is
+integrated into the warp-ctc library to normalize values for each row of the
+input tensor.
+
+More details of the CTC loss can be found by referring to
+[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with
+Recurrent Neural Networks](
+http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf).
+)DOC");
+  }
+};
+
+class WarpCTCGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"),
+                   "Input(WarpCTCGrad) of WarpCTCGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Output(Logits@GRAD) of WarpCTCGradOp should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("Logits"),
+                      ctx->GetInputDim("Logits"));
+    ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
+            ops::WarpCTCGradOp);
+REGISTER_OP_CPU_KERNEL(
+    warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    warpctc_grad,
+    ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
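As a quick numeric aside (an editorial sketch, not part of the patch; the sequence lengths are invented), here is how the flattened [Lp, num_classes + 1] Logits shape and the blank-label check above fit together:

#include <cassert>
#include <vector>

int main() {
  // Three sequences of lengths 4, 2 and 5, flattened along dimension 0.
  std::vector<int> lengths = {4, 2, 5};
  int lp = 0;
  for (int len : lengths) lp += len;           // Lp = 11
  const int num_classes = 9;                   // true classes, excluding blank
  const int sequence_width = num_classes + 1;  // so Logits is [11, 10]
  assert(lp == 11);

  // Attr(blank) must fall in the half-open interval [0, sequence_width).
  const int blank = 0;
  assert(blank >= 0 && blank < sequence_width);
  return 0;
}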
diff --git a/paddle/operators/warpctc_op.cu.cc b/paddle/operators/warpctc_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d8527ac75fd4964f95198417a213ff6155aac2a
--- /dev/null
+++ b/paddle/operators/warpctc_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/warpctc_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    warpctc_grad,
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/warpctc_op.h b/paddle/operators/warpctc_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8aea061c00cc9614db37ed408b6d330ef707d1cf
--- /dev/null
+++ b/paddle/operators/warpctc_op.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_padding.h"
+#include "paddle/operators/math/sequence_scale.h"
+#include "paddle/platform/dynload/warpctc.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext>
+class WarpCTCFunctor {
+ public:
+  /*
+   * \brief Compute the connectionist temporal classification loss,
+   *        and optionally compute the gradient with respect to the inputs.
+   *
+   * If gradient is nullptr, it only computes the CTC loss; otherwise it
+   * computes both the CTC loss and the gradient in one call.
+   *
+   * \param ctx               execution context of this functor
+   * \param input             batch matrix of input probabilities, in
+   *                          max_sequence_length x num_sequences x
+   *                          sequence_width (row-major) format
+   * \param gradient          batch matrix of gradient, with the same shape
+   *                          as input.
+   * \param cpu_labels        labels, always in CPU memory.
+   * \param cpu_label_lengths lengths of all labels, in CPU memory.
+   * \param cpu_input_lengths lengths of all sequences, in CPU memory.
+   * \param sequence_width    number of possible output symbols.
+   * \param num_sequences     number of sequences.
+   * \param blank             blank label used in the CTC loss function.
+   * \param cpu_loss          cost of each sequence, in CPU memory.
+   */
+  void operator()(const framework::ExecutionContext& ctx, const float* input,
+                  float* gradient, const int* cpu_labels,
+                  const int* cpu_label_lengths, const int* cpu_input_lengths,
+                  const size_t sequence_width, const size_t num_sequences,
+                  const size_t blank, float* cpu_loss) {
+    // Init warp-ctc options
+    init(ctx, blank);
+
+    // Compute the required workspace size.
+    // There are no memory allocation operations within warp-ctc itself;
+    // the caller provides the workspace.
+    size_t workspace_bytes = 0;
+    ctcStatus_t status = platform::dynload::get_workspace_size(
+        cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
+        static_cast<int>(num_sequences), options_, &workspace_bytes);
+    PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
+                      "warp-ctc [version %d] Error in get_workspace_size: ",
+                      warpctc_version_,
+                      platform::dynload::ctcGetStatusString(status));
+    PADDLE_ENFORCE_GT(workspace_bytes, 0UL,
+                      "Bytes of workspace got by warp-ctc function, "
+                      "get_workspace_size(), should be larger than 0.");
+
+    Tensor workspace;
+    size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
+    float* workspace_data = workspace.mutable_data<float>(
+        framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
+        ctx.GetPlace());
+    math::SetConstant<DeviceContext, float>()(
+        ctx.template device_context<DeviceContext>(), &workspace,
+        static_cast<float>(0));
+
+    // compute loss and gradient
+    status = platform::dynload::compute_ctc_loss(
+        input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
+        static_cast<int>(sequence_width), static_cast<int>(num_sequences),
+        cpu_loss, workspace_data, options_);
+    PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
+                      "warp-ctc [version %d] Error in compute_ctc_loss: ",
+                      warpctc_version_,
+                      platform::dynload::ctcGetStatusString(status));
+  }
+
+ protected:
+  void init(const framework::ExecutionContext& ctx, const size_t blank) {
+    warpctc_version_ = platform::dynload::get_warpctc_version();
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      options_.loc = CTC_GPU;
+      options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                            ctx.device_context())
+                            .stream();
+#else
+      PADDLE_THROW("[warpctc init] GPU is not enabled.");
+#endif
+    } else {
+      options_.loc = CTC_CPU;
+      options_.num_threads = 1;
+    }
+
+    options_.blank_label = blank;
+  }
+
+ private:
+  int warpctc_version_;
+  ctcOptions options_;
+};
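To make the functor's layout contract concrete, a brief editorial sketch (not part of the patch; all lengths are invented):

#include <cstddef>

int main() {
  // Two sequences of lengths 3 and 5: warp-ctc sees a dense buffer of shape
  // [max_sequence_length, num_sequences, sequence_width] = [5, 2, 10],
  // with the shorter sequence zero-padded along the time axis.
  const std::size_t max_sequence_length = 5;
  const std::size_t num_sequences = 2;
  const std::size_t sequence_width = 10;  // num_classes + 1 (blank)
  const std::size_t input_elements =
      max_sequence_length * num_sequences * sequence_width;  // 100 floats

  // The workspace is sized from get_workspace_size() exactly as above:
  // whole floats rounded up, plus one.
  const std::size_t workspace_bytes = 4096;  // whatever warp-ctc reports
  const std::size_t workspace_elements =
      workspace_bytes / sizeof(float) + 1;  // 1025
  (void)input_elements;
  (void)workspace_elements;
  return 0;
}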
+
+template <typename DeviceContext, typename T>
+class WarpCTCKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* logits = ctx.Input<LoDTensor>("Logits");
+    auto* label = ctx.Input<LoDTensor>("Label");
+    auto* warpctc_grad = ctx.Output<Tensor>("WarpCTCGrad");
+    auto* loss = ctx.Output<LoDTensor>("Loss");
+
+    const size_t level = 0;
+
+    auto logits_lod = framework::ToAbsOffset(logits->lod());
+    auto logits_dims = logits->dims();
+    PADDLE_ENFORCE_EQ(logits_dims[0],
+                      static_cast<int64_t>(logits_lod[level].back()),
+                      "The first dimension of Input(Logits) should be equal "
+                      "to the sum of all sequences' lengths.");
+
+    auto label_lod = framework::ToAbsOffset(label->lod());
+    auto label_dims = label->dims();
+    PADDLE_ENFORCE_EQ(
+        label_dims[0], label->numel(),
+        "The width of each timestep in Input(Label) should be 1.");
+
+    const size_t num_sequences = logits_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
+                      "The number of sequences of Input(Logits) should be "
+                      "equal to that of Input(Label).");
+
+    const size_t sequence_width = logits->numel() / logits_dims[0];
+    auto loss_dims =
+        framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
+
+    // warp-ctc needs the sequence data stored in a transposed, padded format
+    Tensor warpctc_logits;
+    const size_t max_sequence_length =
+        math::MaximumSequenceLength(logits_lod, level);
+    auto warpctc_logits_dims =
+        framework::make_ddim({static_cast<int64_t>(max_sequence_length),
+                              static_cast<int64_t>(num_sequences),
+                              static_cast<int64_t>(sequence_width)});
+    warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
+    math::PaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits, warpctc_logits,
+        false);
+    const T* warpctc_logits_data = warpctc_logits.data<T>();
+
+    std::vector<int> warpctc_label_lengths(num_sequences);
+    std::vector<int> warpctc_logits_lengths(num_sequences);
+
+    for (size_t i = 0; i < num_sequences; ++i) {
+      warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
+      warpctc_logits_lengths[i] =
+          logits_lod[level][i + 1] - logits_lod[level][i];
+    }
+
+    // warp-ctc computes the loss and the gradient in one call; the gradient
+    // data is also stored in the batch format
+    T* warpctc_grad_data =
+        warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), warpctc_grad,
+        static_cast<T>(0));
+
+    // warp-ctc accesses labels in CPU memory
+    Tensor warpctc_label;
+    Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label);
+    const int* warpctc_label_data = warpctc_label.data<int>();
+    // warp-ctc stores loss in CPU memory
+    Tensor warpctc_loss;
+    T* warpctc_loss_data =
+        warpctc_loss.mutable_data<T>(loss_dims, platform::CPUPlace());
+
+    const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
+
+    WarpCTCFunctor<DeviceContext>()(
+        ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
+        warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
+        sequence_width, num_sequences, blank, warpctc_loss_data);
+
+    // Copy the loss back
+    Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class WarpCTCGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* warpctc_grad = ctx.Input<Tensor>("WarpCTCGrad");
+    auto* logits_grad =
+        ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
+    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+
+    logits_grad->mutable_data<T>(ctx.GetPlace());
+    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
+    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits_grad,
+        *warpctc_grad, norm_by_times);
+
+    const T* loss_grad_data = loss_grad->data<T>();
+    math::ScaleLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits_grad,
+        loss_grad_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index dcc59f5ff2ae3a8ca999d72a20cfd5c759987d89..2fdd25dbbe68659f8a0a9da13a87148ed259127a 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ #include #include "paddle/framework/executor.h" @@ -25,12 +25,12 @@ namespace operators { using StepScopeVar = std::vector; using LoDTensor = framework::LoDTensor; -constexpr char kStepBlock[] = "step_block"; -constexpr char kCondition[] = "Condition"; -constexpr char kStepScopes[] = "StepScopes"; -constexpr char kParameters[] = "X"; -constexpr char kParamGrads[] = "X@GRAD"; -constexpr char kOutputs[] = "Out"; +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; class WhileOp : public framework::OperatorBase { public: @@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); auto &cond = scope.FindVar(Input(kCondition))->Get(); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); - framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); auto step_scopes = @@ -64,9 +65,9 @@ class WhileOp : public framework::OperatorBase { class WhileOpMaker : public framework::OpProtoAndCheckerMaker { public: - WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput(kParameters, + AddInput(kX, "A set of variables, which are required by operators inside the " "block of While Op.") .AsDuplicable(); @@ -82,8 +83,8 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(StepScopeVar) A vector of local scope, which size equals the " "step number of While Op. The i'th scope storages temporary " "variables generated in the i'th step."); - AddAttr(kStepBlock, - "The step block inside WhileOp"); + AddAttr(kStepBlock, + "The step block inside WhileOp"); AddComment(R"DOC( )DOC"); } @@ -97,11 +98,9 @@ class WhileGradOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { - // PADDLE_ENFORCE(...) 
- - framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + const platform::Place &dev_place) const override { + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); auto *program = block->Program(); auto *step_scopes = @@ -122,10 +121,14 @@ class WhileGradOp : public framework::OperatorBase { for (size_t i = 0; i < outside_og_names.size(); ++i) { auto outside_og_name = outside_og_names[i]; auto inside_og_name = inside_og_names[i]; - VLOG(10) << "Linking outside " << outside_og_name << " --> inside " - << inside_og_name; - auto &og_outside = detail::Ref(scope.FindVar(outside_og_name)); - auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name)); + VLOG(8) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + auto &og_outside = + detail::Ref(scope.FindVar(outside_og_name), + "Cannot find Outside Gradient %s", outside_og_name); + auto &og_inside = + detail::Ref(cur_scope.Var(inside_og_name), + "Cannot find inside gradient %s", inside_og_name); if (og_outside.Type().hash_code() == typeid(framework::LoDTensor).hash_code()) { auto &outside_tensor = og_outside.Get(); @@ -138,11 +141,11 @@ class WhileGradOp : public framework::OperatorBase { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); - VLOG(10) << outside_og_name << " size = " << outside_array.size(); + VLOG(8) << outside_og_name << " size = " << outside_array.size(); inside_array.resize(outside_array.size()); for (size_t j = 0; j < inside_array.size(); ++j) { - VLOG(10) << j << " " << outside_array[j].numel(); + VLOG(8) << j << " " << outside_array[j].numel(); if (outside_array[j].numel() != 0) { inside_array[j].set_lod(outside_array[j].lod()); inside_array[j].ShareDataWith(outside_array[j]); @@ -155,12 +158,12 @@ class WhileGradOp : public framework::OperatorBase { executor.Run(*program, *cur_scope_iter, block->ID(), false); - auto &pg_names = Outputs(kParamGrads); - auto &p_names = Inputs(kParameters); + auto &pg_names = Outputs(kXGRAD); + auto &p_names = Inputs(kX); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { if (pg_names[param_id] == framework::kEmptyVarName) { - continue; // iterator doesn't have gradient + continue; // parameter doesn't have gradient } auto inside_grad_name = framework::GradVarName(p_names[param_id]); @@ -180,22 +183,26 @@ class WhileGradOp : public framework::OperatorBase { if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; - attrs["data_type"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; + auto var_name = pg_names[param_id]; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); - zero_op->Run(scope, dev_ctx); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {var_name}}}, attrs); + zero_op->Run(scope, dev_place); + scope.FindVar(var_name) + ->GetMutable() + ->set_lod(inside_tensor.lod()); } } - // sum gradient auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); - sum_op->Run(cur_scope, dev_ctx); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, dev_place); 
cur_scope.Rename(new_inside_name, inside_grad_name); } } @@ -207,36 +214,55 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDescBind(); - grad->SetType("while_grad"); - grad->SetInput(kParameters, Input(kParameters)); - grad->SetOutput( - framework::GradVarName(kParameters), - InputGrad(kParameters, /*do not drop empty gradient*/ false)); - grad->SetInput(kOutputs, Output(kOutputs)); + std::unique_ptr Apply() const override { + auto *while_grad = new framework::OpDesc(); + while_grad->SetType("while_grad"); + while_grad->SetInput(kX, Input(kX)); + while_grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetInput(kStepScopes, Output(kStepScopes)); + + auto *grad_block = this->grad_block_[0]; + auto *fwd_block = grad_block->ParentBlock(); + + // Not all of IGs will be generated by inner gradient operators of while op. + // Ignore IGs that is not generated by the inside block. + std::unordered_set inner_op_outputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &oname : op->OutputArgumentNames()) { + inner_op_outputs.insert(oname); + } + } + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); + for (auto &each_ig : igs) { + if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { + VLOG(8) << "Ignore " << each_ig; + each_ig = framework::kEmptyVarName; + } + } + while_grad->SetOutput(framework::GradVarName(kX), igs); // OG should be re-calculated by step blocks, since many outputs of while op // do not need to calculate gradients. std::unordered_set block_ins; - { - for (auto &p : Input(kParameters)) { - block_ins.insert(p); - } - for (auto &o : Output(kOutputs)) { - block_ins.insert(o); - } + block_ins.reserve(Input(kX).size() + Output(kOutputs).size()); + for (auto &p : Input(kX)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); } std::unordered_set extra_inputs; - for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { - for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) { - if (block_ins.find(input_name) != block_ins.end()) { + for (const auto *op : grad_block->AllOps()) { + for (auto &input_name : op->InputArgumentNames()) { + // If the input of Op has been recorded or is generated by the forward + // block, do not make it as input again. + if (block_ins.find(input_name) != block_ins.end() || + fwd_block->FindVar(input_name) != nullptr) { continue; } extra_inputs.insert(input_name); } - - for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) { + for (auto &output_name : op->OutputArgumentNames()) { block_ins.insert(output_name); } } @@ -245,24 +271,24 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { extra_inputs_list.resize(extra_inputs.size()); std::copy(extra_inputs.begin(), extra_inputs.end(), extra_inputs_list.begin()); - grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); - grad->SetInput(kStepScopes, Output(kStepScopes)); - grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + while_grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + + while_grad->SetAttrMap(this->Attrs()); + while_grad->SetBlockAttr(kStepBlock, *grad_block); // record the original output gradient names, since the gradient name of // while operator could be renamed. 
- grad->SetAttr("original_output_grad", extra_inputs_list); + while_grad->SetAttr("original_output_grad", extra_inputs_list); - return std::unique_ptr(grad); + return std::unique_ptr(while_grad); } }; class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { - auto p_names = op_desc.Input(kParameters); - auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto p_names = op_desc.Input(kX); + auto pg_names = op_desc.Output(framework::GradVarName(kX)); for (size_t i = 0; i < p_names.size(); ++i) { auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); @@ -280,28 +306,28 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { class WhileGradOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { - ctx->HasInputs(kParameters); - ctx->HasOutputs(framework::GradVarName(kParameters)); + ctx->HasInputs(kX); + ctx->HasOutputs(framework::GradVarName(kX)); ctx->HasInputs(kOutputs); ctx->HasInputs(framework::GradVarName(kOutputs)); - auto p_names = ctx->Inputs(kParameters); - auto pg_names = ctx->Outputs(kParamGrads); - auto dims = ctx->GetInputsDim(kParameters); - auto var_types = ctx->GetInputsVarType(kParameters); + auto p_names = ctx->Inputs(kX); + auto pg_names = ctx->Outputs(kXGRAD); + auto var_types = ctx->GetInputsVarType(kX); std::vector names_to_set; std::vector dims_to_set; for (size_t i = 0; i < p_names.size(); ++i) { if (pg_names[i] == framework::kEmptyVarName) { continue; } - if (var_types[i] == framework::VarDesc::LOD_TENSOR) { + auto dims = ctx->GetInputsElementDim(kX, i); + if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { names_to_set.push_back(pg_names[i]); - dims_to_set.push_back(dims[i]); - } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { + dims_to_set.push_back(dims); + } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) { // not sure how to set the dim of LOD_TENSOR_ARRAY names_to_set.push_back(pg_names[i]); - dims_to_set.push_back(dims[i]); + dims_to_set.push_back(dims); } } ctx->SetDims(names_to_set, dims_to_set); diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc index 5cc7c47d4486c3d149c37fd6e312780f3d44eda8..8ca048257e53af21f327ee1b8f24b9e2e5d54060 100644 --- a/paddle/optimizer/adadelta_optimizer.cc +++ b/paddle/optimizer/adadelta_optimizer.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "adadelta_optimizer.h" #include diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h index 6aab1ad553b15ebbd2d04c9323c5e56e1b8f60f5..48f1ae175041cc1cfa11787c44317dd173430928 100644 --- a/paddle/optimizer/adadelta_optimizer.h +++ b/paddle/optimizer/adadelta_optimizer.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc index c981996bab1b2e7ae5d6e2d858a73efde12e32f3..c6d39a366adcd40e7298e1f1bed48c15f517a217 100644 --- a/paddle/optimizer/adagrad_optimizer.cc +++ b/paddle/optimizer/adagrad_optimizer.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h index 447b7c7547d5bad7436df6f3b3582b4a219f08c8..b0cff061f5c311d2356bab4965ec65c5eca3dbb8 100644 --- a/paddle/optimizer/adagrad_optimizer.h +++ b/paddle/optimizer/adagrad_optimizer.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc index 6dc2d749708d0e2a7f36734d89eec30d4576842e..8a384b59c47df321438e3a3bc245a2398a7ef6d1 100644 --- a/paddle/optimizer/adam_optimizer.cc +++ b/paddle/optimizer/adam_optimizer.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "adam_optimizer.h" #include diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h index 37ab53afc37a5f749a2909de12c7871ed926583f..7df40064df31536cead9eba4ce827452cf0dce38 100644 --- a/paddle/optimizer/adam_optimizer.h +++ b/paddle/optimizer/adam_optimizer.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h index bbb1ee48214cecdc6b6cd2a400cc9d12d5e8b64a..9a44a776f2b032bc2c3452a739caf0994e25891b 100644 --- a/paddle/optimizer/lr_policy.h +++ b/paddle/optimizer/lr_policy.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once #include diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc index faa23764522cef03bae1359adbf58d10ee7809ac..3af44484363654befe85c8511d7c0791d5a33ab8 100644 --- a/paddle/optimizer/optimizer.cc +++ b/paddle/optimizer/optimizer.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "optimizer.h" #include diff --git a/paddle/optimizer/optimizer.h b/paddle/optimizer/optimizer.h index e6fa12a4d250ccb078358704b0131942ea6ab039..516e612167f557f043d3fe29789bec4b73405755 100644 --- a/paddle/optimizer/optimizer.h +++ b/paddle/optimizer/optimizer.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc index da92c2d01cc2a27d1fadd51a338d23b01e0cb0bc..1603e5fdc8a69d8b117f02b9c950a5374625c632 100644 --- a/paddle/optimizer/parameter_optimizer.cc +++ b/paddle/optimizer/parameter_optimizer.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include "adadelta_optimizer.h" diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h index 99d0416e751c4ca6695d6ed77396e18d48fc86b8..1f501c49e1d7a8018f3274ae1128ca230b9298bb 100644 --- a/paddle/optimizer/parameter_optimizer.h +++ b/paddle/optimizer/parameter_optimizer.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc index f29e5317120642e3790a6f6c1976bdda67093a0c..795d2de1d65b3a25c312d0c4e31e0105922838df 100644 --- a/paddle/optimizer/parameter_optimizer_test.cc +++ b/paddle/optimizer/parameter_optimizer_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. @@ -127,8 +140,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); } TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/optimizer/serialization_test.cc b/paddle/optimizer/serialization_test.cc index 4c416f55ee0bd70f9ec6e288b08a5399d8b2bf39..0f1b14eec135ab37a599965965e2e9d8bb65b90c 100644 --- a/paddle/optimizer/serialization_test.cc +++ b/paddle/optimizer/serialization_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
@@ -46,8 +59,3 @@ TEST(TensorToProto, Case2) { EXPECT_EQ(t1[i], t[i]); } } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc index c150144ac24b8375d08691a98be680b6bf5d1e7f..ee80f543fc4c1d87ba56540fc581816df8a39793 100644 --- a/paddle/optimizer/sgd_optimizer.cc +++ b/paddle/optimizer/sgd_optimizer.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "sgd_optimizer.h" #include "serialization.h" diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h index 0b1da0aa27d98e8d6a8d9fd7a1ebe355acb2a1f4..16a4df9973ef6a599a8ecee6be5903f13799771c 100644 --- a/paddle/optimizer/sgd_optimizer.h +++ b/paddle/optimizer/sgd_optimizer.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h index 86fa625e01b981f0377bd699d191fc865ee89784..e999e9bda129a85306f73afc75f3936fefb4c006 100644 --- a/paddle/optimizer/tensor.h +++ b/paddle/optimizer/tensor.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once /** * @brief tensor used by optimizer diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h index f157188a4f736319ea187052b90a17f8be9e9edb..5b0c52a30dfbc34f0db4a03366e31da54b86c9fc 100644 --- a/paddle/parameter/FirstOrderOptimizer.h +++ b/paddle/parameter/FirstOrderOptimizer.h @@ -38,7 +38,7 @@ public: real torch_learningRate = optConfig_.learning_method() == "torch_momentum" ? 1.0 - paraConfig.momentum() : 1.0; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN sgdUpdate(learningRate_ * paraConfig.learning_rate() * (firstTime_ ? 1.0 : torch_learningRate), paraConfig.momentum(), diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 1898598e49652a2829e57329bab6017304cec662..d60cb363830ff26a1f5054fb4cebf37afdfd1d03 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, const real* grad, real* momentumVec) { decayRate *= learningRate; -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif for (size_t i = 0; i < size; ++i) { diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index bd86a9fe268c277065cd450f91b544def6c4d32f..44f6d85cd1510f309595ca711de2e0f767219580 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,27 +1,41 @@ -cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog) +if(WITH_GPU) + cc_library(enforce SRCS enforce.cc DEPS nccl) +else() + cc_library(enforce SRCS enforce.cc) +endif() +cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) + +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog) +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc) +cc_library(place SRCS place.cc DEPS enforce) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) -cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece) - IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) ELSE() set(GPU_CTX_DEPS) ENDIF() +IF(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +ELSE() + set(MKLDNN_CTX_DEPS) +ENDIF() + # memcpy deoends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator - system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) -nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) + system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) + 
+cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h index d9f49527dcf150fcb35d3af512088f75dec0b5c6..00337a7f051758559a0f8012d8c78dbe8e3457a6 100644 --- a/paddle/platform/call_once.h +++ b/paddle/platform/call_once.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc index 8fb195aa7c0a41b7417ff5cf63394046e9c72267..1bfe62c1fb667e17d7383cf0a1b2632043c72743 100644 --- a/paddle/platform/cpu_info_test.cc +++ b/paddle/platform/cpu_info_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/platform/cpu_info.h" #include "paddle/string/printf.h" diff --git a/paddle/platform/cuda_helper.h b/paddle/platform/cuda_helper.h index a7d99cde106a0a66f122a8c43f49717c03e60dec..376bb0e6887c797c3c1019e92f738a62d01a9c51 100644 --- a/paddle/platform/cuda_helper.h +++ b/paddle/platform/cuda_helper.h @@ -31,6 +31,16 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512; // For atomicAdd. USE_CUDA_ATOMIC(Add, float); +USE_CUDA_ATOMIC(Add, int); +USE_CUDA_ATOMIC(Add, unsigned int); +USE_CUDA_ATOMIC(Add, unsigned long long int); + +CUDA_ATOMIC_WRAPPER(Add, int64_t) { + static_assert(sizeof(int64_t) == sizeof(long long int), + "long long should be int64"); + return CudaAtomicAdd(reinterpret_cast(address), + static_cast(val)); +} #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..67d5f626d41c0fae280062533d6c1d1074341164 --- /dev/null +++ b/paddle/platform/cuda_profiler.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +namespace paddle { +namespace platform { + +void CudaProfilerInit(std::string output_file, std::string output_mode, + std::string config_file) { + PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); + cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; + PADDLE_ENFORCE( + cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +} + +void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } + +void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); } + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h index c5d8a6066ef3becb601344590f977a38c2af0a63..80a4c9bb4bbcd03cf849d86118db4e502382f031 100644 --- a/paddle/platform/cudnn_helper.h +++ b/paddle/platform/cudnn_helper.h @@ -116,7 +116,7 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( case DataLayout::kNCHW: return CUDNN_TENSOR_NCHW; case DataLayout::kNCDHW: - return CUDNN_TENSOR_NCHW; // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW + return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same default: PADDLE_THROW("Unknown cudnn equivalent for order"); } @@ -143,7 +143,7 @@ class ScopedTensorDescriptor { strides[i] = dims[i + 1] * strides[i + 1]; } // Update tensor descriptor dims setting if groups > 1 - // FIXME(typhoonzero): Assume using NCHW or NCDHW order + // NOTE: Assume using NCHW or NCDHW order std::vector dims_with_group(dims.begin(), dims.end()); // copy if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; @@ -186,7 +186,6 @@ class ScopedFilterDescriptor { // width of the filter. std::vector kernel_with_group(kernel.begin(), kernel.end()); if (groups > 1) { - // M /= groups kernel_with_group[0] /= groups; // NOTE: input filter(C) of the filter is already asserted to be C/groups. } diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 7afcdfce9371e29aad968a1729931173fb2309b5..9d9348079a0179418ab1a1474cfd8b69136f26b2 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,17 +15,46 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -template <> -Eigen::DefaultDevice* DeviceContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return reinterpret_cast(this)->eigen_device(); +DeviceContextPool* DeviceContextPool::pool = nullptr; + +const platform::DeviceContext* DeviceContextPool::Get( + const platform::Place& place) { + auto it = device_contexts_.find(place); + if (it == device_contexts_.end()) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return it->second; +} + +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_.emplace(places[i], + new platform::CPUDeviceContext( + boost::get(places[i]))); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_.emplace(places[i], + new platform::CUDADeviceContext( + boost::get(places[i]))); +#else + PADDLE_THROW( + "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " + "option"); +#endif + } + } } CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } -CPUDeviceContext::CPUDeviceContext(CPUPlace place) { +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -33,16 +62,10 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } +Place CPUDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice* -DeviceContext::GetEigenDevice() const { - return reinterpret_cast(this)->eigen_device(); -} - class EigenCudaStreamDevice : public Eigen::StreamInterface { public: EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -50,7 +73,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } ~EigenCudaStreamDevice() override {} - void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { + void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { stream_ = cuda_stream; place_ = place; device_prop_ = &Eigen::m_deviceProperties[place.device]; @@ -89,14 +112,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } private: - GPUPlace place_; + CUDAPlace place_; const cudaStream_t* stream_; // not owned; const cudaDeviceProp* device_prop_; // not owned; mutable void* scratch_; mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); @@ -104,15 +127,21 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + if (dynload::HasCUDNN()) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + } else { + cudnn_handle_ = nullptr; + } } CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); 
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + if (cudnn_handle_ != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -122,10 +151,6 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); -} - -void CUDADeviceContext::Finish() const { - Wait(); PADDLE_ENFORCE(cudaGetLastError()); } @@ -143,5 +168,69 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; } #endif +#ifdef PADDLE_WITH_MKLDNN +MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) + : CPUDeviceContext(place), ready_(false) { + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); +} + +template +void MKLDNNDeviceContext::AddElement(const std::string& op_key, + const T& value) { + if (GetElement(op_key)) { + return; + } + GetElementPool().emplace(op_key, std::move(value)); +} + +template +const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const { + auto it = GetElementPool().find(op_key); + return it == GetElementPool().end() ? nullptr : it->second; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return memory_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_desc_pool_; +} + +void MKLDNNDeviceContext::Execute(bool block) { + if (pipeline_.empty()) { + return; + } + ResetStream(); + stream_->submit(pipeline_).wait(block); + ready_ = false; + pipeline_.clear(); +} + +void MKLDNNDeviceContext::ResetStream() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn have specific method to reset this state + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; +} + +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 526d089e35da9c9f89a3852095ad3a4c82d4d85d..9826a642768a3346110a7872a726aba15eac1fb0 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -11,8 +11,8 @@ limitations under the License. */ #pragma once -#include "paddle/platform/enforce.h" -#include "paddle/platform/place.h" +#include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/platform/dynload/cublas.h" @@ -20,34 +20,26 @@ limitations under the License. 
*/ #include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif -#include + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/platform/mkldnn_helper.h" +#endif + +#include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" +#include "glog/logging.h" + namespace paddle { namespace platform { -template -struct EigenDeviceConverter; - -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; - virtual void Wait() const {} - - virtual void Finish() const {} }; class CPUDeviceContext : public DeviceContext { @@ -60,28 +52,30 @@ class CPUDeviceContext : public DeviceContext { Place GetPlace() const override; private: + CPUPlace place_; std::unique_ptr eigen_device_; }; -#ifdef PADDLE_WITH_CUDA +template +struct DefaultDeviceContextType; + template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; +struct DefaultDeviceContextType { + using TYPE = CPUDeviceContext; }; +#ifdef PADDLE_WITH_CUDA + class EigenCudaStreamDevice; class CUDADeviceContext : public DeviceContext { public: - explicit CUDADeviceContext(GPUPlace place); + explicit CUDADeviceContext(CUDAPlace place); virtual ~CUDADeviceContext(); /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - /*! \brief Check potential errors for the cuda kernel calls. */ - void Finish() const override; - /*! \brief Return place in the device context. */ Place GetPlace() const override; @@ -98,7 +92,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() const; private: - GPUPlace place_; + CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; @@ -108,7 +102,109 @@ class CUDADeviceContext : public DeviceContext { cublasHandle_t cublas_handle_; }; +template <> +struct DefaultDeviceContextType { + using TYPE = CUDADeviceContext; +}; + #endif +#ifdef PADDLE_WITH_MKLDNN +class MKLDNNDeviceContext : public CPUDeviceContext { + public: + explicit MKLDNNDeviceContext(CPUPlace place); + + /* \brief Add new element: memory, primitive or primitive desc */ + template + void AddElement(const std::string& op_key, const T& value); + + /* \brief Get existed element: memory, primitive or primitive desc */ + template + const T& GetElement(const std::string& op_key) const; + + /* \brief Get element pool: memory, primitive or primitive desc pool */ + template + const std::unordered_map>& + GetElementPool() const; + + /* \brief Get the active engine */ + const MKLDNNEngine& engine() const { return *engine_; } + + /* \brief Submit primitive to pipeline */ + void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); } + + /*! \brief Execute all submitted primitives in pipeline */ + void Execute(bool block = true); + + protected: + /*! \brief Reset the stream to prepare next exectue */ + void ResetStream(); + + private: + std::unordered_map> + memory_pool_; + std::unordered_map> + primitive_pool_; + std::unordered_map> + primitive_desc_pool_; + std::vector pipeline_; + MKLDNNStreamPtr stream_; + MKLDNNEnginePtr engine_; + bool ready_; +}; +#endif + +/*! 
\brief device context pool singleton */ +class DeviceContextPool { + public: + explicit DeviceContextPool(const std::vector& places); + + static DeviceContextPool& Instance() { + PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); + return *pool; + } + + /*! \brief Create should only called by Init function */ + static DeviceContextPool& Init(const std::vector& places) { + if (pool == nullptr) { + pool = new DeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + const platform::DeviceContext* Get(const platform::Place& place); + + template + const typename DefaultDeviceContextType::TYPE* GetByPlace( + const Place& place) { + return reinterpret_cast< + const typename DefaultDeviceContextType::TYPE*>(Get(place)); + } + + size_t size() const { return device_contexts_.size(); } + + private: + static DeviceContextPool* pool; + constexpr static int LEFT_SHIFT = 8; + struct Hash { + std::hash hash_; + size_t operator()(const platform::Place& place) const { + int pre_hash = place.which() << LEFT_SHIFT; + if (platform::is_gpu_place(place)) { + pre_hash += boost::get(place).GetDeviceId(); + } + return hash_(pre_hash); + } + }; + std::unordered_map + device_contexts_; + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cu similarity index 51% rename from paddle/platform/device_context_test.cc rename to paddle/platform/device_context_test.cu index 8bf5174c4a5579f6f5602dd38e5a87ed3ef444a7..767fe9b24a5d51630397a864c98928de52d21f31 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cu @@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
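For orientation, a minimal caller-side sketch of how the pool above is meant to be used (assumptions: a CPU-only process, and that Init has not been run elsewhere; in the framework the startup code performs this step, as the tests below also do):

#include <vector>
#include "paddle/platform/device_context.h"

int main() {
  namespace plat = paddle::platform;

  // Create the singleton once, up front, with every place the process uses.
  std::vector<plat::Place> places;
  places.emplace_back(plat::CPUPlace());
  plat::DeviceContextPool::Init(places);

  // Any later code can then borrow the shared, long-lived contexts.
  plat::DeviceContextPool& pool = plat::DeviceContextPool::Instance();
  const plat::DeviceContext* generic = pool.Get(plat::CPUPlace());
  const plat::CPUDeviceContext* typed =
      pool.GetByPlace<plat::CPUPlace>(plat::CPUPlace());
  return (generic != nullptr && typed != nullptr) ? 0 : 1;
}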
*/ -#include "paddle/platform/device_context.h" #include "gtest/gtest.h" +#include "paddle/platform/device_context.h" + +#include "glog/logging.h" TEST(Device, Init) { using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); - Eigen::GpuDevice* gpu_device = - device_context->template GetEigenDevice(); + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } @@ -32,11 +33,11 @@ TEST(Device, Init) { TEST(Device, CUDADeviceContext) { using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); @@ -47,3 +48,39 @@ TEST(Device, CUDADeviceContext) { delete device_context; } } + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::CUDADeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::CUDAPlace; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); + + std::vector gpu_places; + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); + } +} + +int main(int argc, char** argv) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index bb3fec1be9e811c26cc6851314e960e96fc366b3..cf2081b434961c17c1b65509909699788d2b9ad9 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,3 +1,4 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc DEPS dynamic_loader nccl) +cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 9cd2a1f565526f8dc45932ba6168f4e25c6ad238..6aca716657c5f629ce40e88c04dc25fbbc9b4f36 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include "paddle/platform/dynload/cublas.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index d3e4cb567d71b987724366b6a0896f5df0eb6055..701f6240fef2e9df25472c0caf6177e7f1964cfd 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace platform { @@ -37,6 +38,25 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef PADDLE_USE_DSO +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. Cannot invoke method %s", + fn_name); +} +#else +bool HasCUDNN() { return true; } +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index b2d69da93bcd4a5c8e694a18ca648ddc4bd947af..b92634794947f6979255d6241ff3a333f3771bfb 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -25,17 +25,20 @@ namespace dynload { extern std::once_flag cudnn_dso_flag; extern void* cudnn_dso_handle; +extern bool HasCUDNN(); #ifdef PADDLE_USE_DSO +extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... 
args) -> decltype(__name(args...)) { \ using cudnn_func = decltype(__name(args...)) (*)(Args...); \ std::call_once(cudnn_dso_flag, \ - paddle::platform::dynload::GetCudnnDsoHandle, \ + paddle::platform::dynload::GetCUDNNDsoHandle, \ &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ @@ -135,6 +138,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index 6feba42c0d9d618d27da12e6a6752058b296995e..c8c09ae608fa7cc67a54fada9cfe86b40096a9fd 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -78,12 +78,11 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, *dso_handle = dlopen(dso_path.c_str(), dynload_flags); if (nullptr == *dso_handle) { if (dso_path == "libcudnn.dylib") { - PADDLE_ENFORCE(true, - "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"); + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; } } } @@ -92,7 +91,8 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, static inline void GetDsoHandleFromSearchPath(const std::string& search_root, const std::string& dso_name, - void** dso_handle) { + void** dso_handle, + bool throw_on_error = true) { int dynload_flags = RTLD_LAZY | RTLD_LOCAL; *dso_handle = nullptr; @@ -111,15 +111,19 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - PADDLE_ENFORCE(nullptr != *dso_handle, - "Failed to find dynamic library: %s ( %s ) \n Please specify " - "its path correctly using following ways: \n Method. set " - "environment variable LD_LIBRARY_PATH on Linux or " - "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " - "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " - "using the DYLD_LIBRARY_PATH is impossible unless System " - "Integrity Protection (SIP) is disabled.", - dlPath, dlerror()); + auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... 
\n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; + if (throw_on_error) { + PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); + } else if (nullptr == *dso_handle) { + LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + } } void GetCublasDsoHandle(void** dso_handle) { @@ -130,11 +134,12 @@ void GetCublasDsoHandle(void** dso_handle) { #endif } -void GetCudnnDsoHandle(void** dso_handle) { +void GetCUDNNDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + false); #else - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); #endif } diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h index c0e5452e5ae723ec314ebafde86a6ff63980be00..7b0c8c16d7484480550f3c753fe52d1f04651900 100644 --- a/paddle/platform/dynload/dynamic_loader.h +++ b/paddle/platform/dynload/dynamic_loader.h @@ -32,7 +32,7 @@ void GetCublasDsoHandle(void** dso_handle); * @param **dso_handle dso handler * */ -void GetCudnnDsoHandle(void** dso_handle); +void GetCUDNNDsoHandle(void** dso_handle); /** * @brief load the DSO of CURAND diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc index 8f92b8d94d56047b7d3fb43b15e3c06575c8d57b..4cec829a8ad8994d4a7643613331881e3a397b9a 100644 --- a/paddle/platform/dynload/nccl.cc +++ b/paddle/platform/dynload/nccl.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/platform/dynload/nccl.h" @@ -25,6 +25,11 @@ void *nccl_dso_handle; NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); +void LoadNCCLDSO() { + platform::call_once(nccl_dso_flag, + [] { GetNCCLDsoHandle(&nccl_dso_handle); }); +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h index 981b2ab258a34ce92f02ee12b5957f88ba61d1c0..6c776afc97a53c964f0bc2b2a8abf2c29f474d3f 100644 --- a/paddle/platform/dynload/nccl.h +++ b/paddle/platform/dynload/nccl.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -28,18 +28,18 @@ extern std::once_flag nccl_dso_flag; extern void* nccl_dso_handle; #ifdef PADDLE_USE_DSO -#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(__name(args...)) (*)(Args...); \ - platform::call_once(nccl_dso_flag, \ - paddle::platform::dynload::GetNCCLDsoHandle, \ - &nccl_dso_handle); \ - void* p_##__name = dlsym(nccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +extern void LoadNCCLDSO(); + +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(__name(args...)) (*)(Args...); \ + paddle::platform::dynload::LoadNCCLDSO(); \ + void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #else #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ @@ -63,6 +63,8 @@ extern void* nccl_dso_handle; __macro(ncclAllReduce); \ __macro(ncclBcast); \ __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclGetErrorString); diff --git a/paddle/platform/dynload/warpctc.cc b/paddle/platform/dynload/warpctc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b7d01a6e8f6cccc0082f65f9511085d2a3111b7 --- /dev/null +++ b/paddle/platform/dynload/warpctc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/platform/dynload/warpctc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/warpctc.h b/paddle/platform/dynload/warpctc.h new file mode 100644 index 0000000000000000000000000000000000000000..acafcaff2ccc33c315f216e31b866bf4c8ae1fec --- /dev/null +++ b/paddle/platform/dynload/warpctc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ctc.h" +#include "paddle/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once(warpctc_dso_flag, \ + paddle::platform::dynload::GetWarpCTCDsoHandle, \ + &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(get_workspace_size) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/operators/pool_cudnn_op.h b/paddle/platform/enforce.cc similarity index 81% rename from paddle/operators/pool_cudnn_op.h rename to paddle/platform/enforce.cc index 5adf27f5bccae8542719612320bc6dbe21007634..e8d31bc782ec3cddd18ceaedf88fe5e7b4aed2cc 100644 --- a/paddle/operators/pool_cudnn_op.h +++ b/paddle/platform/enforce.cc @@ -1,19 +1,19 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
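Because the wrapper macros above are dense, it may help to see roughly what DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) generates. This is a hand expansion for illustration only, with the local pointer name simplified:

namespace paddle {
namespace platform {
namespace dynload {

struct DynLoad__get_warpctc_version {
  template <typename... Args>
  auto operator()(Args... args) -> decltype(get_warpctc_version(args...)) {
    using warpctcFunc = decltype(get_warpctc_version(args...)) (*)(Args...);
    // Load libwarpctc at most once, the first time any wrapper is called.
    std::call_once(warpctc_dso_flag,
                   paddle::platform::dynload::GetWarpCTCDsoHandle,
                   &warpctc_dso_handle);
    // Resolve the real symbol and forward the call to it.
    void* p = dlsym(warpctc_dso_handle, "get_warpctc_version");
    return reinterpret_cast<warpctcFunc>(p)(args...);
  }
};
extern DynLoad__get_warpctc_version get_warpctc_version;

}  // namespace dynload
}  // namespace platform
}  // namespace paddle

// Call sites then read like an ordinary function call:
//   int v = paddle::platform::dynload::get_warpctc_version();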
*/ -#pragma once - -#include "paddle/framework/op_registry.h" -#include "paddle/operators/pool_op.h" +#include "paddle/platform/enforce.h" namespace paddle { -namespace operators {} // namespace operators +namespace platform {} // namespace platform } // namespace paddle diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index bfe708748a62ff9ac5d151bc652142e1f4925c83..d1c7be0790b5e11d6273efe6c08cdb7bf22425c6 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -22,6 +22,7 @@ limitations under the License. */ #include #include +#include "paddle/platform/macros.h" #include "paddle/string/printf.h" #include "paddle/string/to_string.h" @@ -49,7 +50,6 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace { #ifdef __GNUC__ inline std::string demangle(std::string name) { int status = -4; // some arbitrary value to eliminate the compiler warning @@ -60,7 +60,6 @@ inline std::string demangle(std::string name) { #else inline std::string demangle(std::string name) { return name; } #endif -} struct EnforceNotMet : public std::exception { std::exception_ptr exp_; @@ -236,16 +235,24 @@ inline void throw_on_error(T e) { __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ - PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \ - paddle::string::Sprintf("" __VA_ARGS__)); - -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ - PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ - "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ - #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ - paddle::string::to_string(__VAL1), \ - paddle::string::Sprintf("" __VA_ARGS__)); +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + do { \ + if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ + PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ + " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) } // namespace platform } // namespace paddle diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h new file mode 100644 index 0000000000000000000000000000000000000000..694a66d9ac4eb6ad02daf1931806fa1287de7cab --- /dev/null +++ b/paddle/platform/for_range.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
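The PADDLE_ENFORCE_* comparison macros above are now wrapped in do { ... } while (0) instead of expanding to a bare statement. A small self-contained illustration (not from the patch) of the dangling-else hazard this construction avoids:

void Fail();  // stand-in for the real error path

// A naive statement-like macro:
#define BAD_CHECK(cond) \
  if (!(cond)) Fail()

// The hardened form used by the new PADDLE_ENFORCE_* macros:
#define GOOD_CHECK(cond) \
  do {                   \
    if (!(cond)) Fail(); \
  } while (0)

void Demo(bool a, bool b) {
  if (a)
    BAD_CHECK(b);   // the else below silently binds to BAD_CHECK's hidden if
  else
    Fail();

  if (a)
    GOOD_CHECK(b);  // expands to exactly one statement; else binds correctly
  else
    Fail();
}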
*/ + +#pragma once +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace platform { + +template +struct ForRange { + ForRange(const DeviceContext& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +#ifdef __NVCC__ +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, int limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { + constexpr int num_threads = 1024; + int block_size = limit_ <= num_threads ? limit_ : num_threads; + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const CUDADeviceContext& dev_ctx_; + int limit_; +}; + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 36b216d872138d49bfd5ab6e3499d15d49ebd0ca..7037551d7544d6fea54e2f4bf887309b7dc5a52e 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/platform/enforce.h" -DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, - "Default use 95% of GPU memory for PaddlePaddle," +DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); namespace paddle { @@ -73,17 +73,22 @@ size_t GpuMaxChunkSize() { size_t available = 0; GpuMemoryUsage(available, total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + size_t reserving = static_cast(0.05 * total); + // If available less than minimum chunk size, no usable memory exists. + available = + std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), + total - reserving); // Reserving the rest memory for page tables, etc. - size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; - // If available less than minimum chunk size, no usable memory exists. - available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); + size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * + (total - reserving)); - // If available less than reserving, no usable memory exists. 
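As a usage sketch for the ForRange abstraction just introduced (illustrative only: SquareFunctor and Square are made-up names, not part of this patch), a single functor can drive both the CPU serial loop and the CUDA kernel launch:

#include "paddle/platform/for_range.h"
#include "paddle/platform/hostdevice.h"

struct SquareFunctor {
  const float* x_;
  float* y_;
  HOSTDEVICE void operator()(size_t i) const { y_[i] = x_[i] * x_[i]; }
};

template <typename DeviceContext>
void Square(const DeviceContext& ctx, const float* x, float* y, size_t n) {
  // Dispatches to a plain for loop on CPUDeviceContext and to the
  // ForRangeElemwiseOp kernels on CUDADeviceContext.
  paddle::platform::ForRange<DeviceContext> for_range(ctx, n);
  for_range(SquareFunctor{x, y});
}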
- size_t usable = std::max(available, reserving) - reserving; + PADDLE_ENFORCE_LE(allocating, available); - return usable; + return allocating; } void GpuMemcpyAsync(void *dst, const void *src, size_t count, @@ -92,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count, "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); } -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind) { - PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); - // note: cudaMemcpy may actually be asynchronous with respect to the caller, - // block on stream 0 to make sure the copy has completed - PADDLE_ENFORCE( - cudaStreamSynchronize(0), - "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync"); -} - void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { PADDLE_ENFORCE( diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index db961f3838af73855312d4cf6a80e2355306e08f..d05131fa4196057d19a8ae57bf4574c666e409cf 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -52,10 +52,6 @@ size_t GpuMaxChunkSize(); void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); -//! Copy memory from address src to dst synchronously. -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind); - //! Copy memory from one device to another device. void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream); diff --git a/paddle/platform/hostdevice.h b/paddle/platform/hostdevice.h index eb2df291cceef553d6422e6166e1fef2c63e2a47..fa4659ed2988a2199d8a8450f825d31fd0ca5907 100644 --- a/paddle/platform/hostdevice.h +++ b/paddle/platform/hostdevice.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once #ifdef __CUDACC__ diff --git a/paddle/platform/mkldnn_helper.h b/paddle/platform/mkldnn_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..cd52a8b4c434071a030a8e7a8a70fc3adba8460c --- /dev/null +++ b/paddle/platform/mkldnn_helper.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
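To make the revised GpuMaxChunkSize arithmetic above concrete, a worked example with purely illustrative numbers (a 12 GB device with 11.5 GB currently free, and the default flag value of 0.92; the clamping of `available` is left out for brevity):

size_t total = 12UL << 30;                // 12 GB reported by the driver
size_t available = 23UL << 29;            // 11.5 GB currently free
size_t reserving =
    static_cast<size_t>(0.05 * total);    // ~0.6 GB kept for page tables etc.
size_t allocating = static_cast<size_t>(
    0.92 * (total - reserving));          // ~10.5 GB handed to the allocator
// PADDLE_ENFORCE_LE(allocating, available) holds: ~10.5 GB <= 11.5 GB, and
// the chunk size is no longer derived from the momentary free amount alone.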
*/ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +using MKLDNNStream = mkldnn::stream; +using MKLDNNEngine = mkldnn::engine; +using MKLDNNMemory = mkldnn::memory; +using MKLDNNPrimitive = mkldnn::primitive; +using MKLDNNPrimitiveDesc = mkldnn::handle; + +typedef std::unique_ptr MKLDNNStreamPtr; +typedef std::unique_ptr MKLDNNEnginePtr; +typedef std::unique_ptr MKLDNNMemoryPtr; +typedef std::unique_ptr MKLDNNPrimitivePtr; +typedef std::unique_ptr MKLDNNPrimitiveDescPtr; + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index c99dae68bef67c58d3efea42fef45e84bb3d9255..ef6d845874745af1150e4425f8d6be416cc44ece 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -1,28 +1,30 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include #include "glog/logging.h" #include "gtest/gtest.h" + +#include "paddle/framework/init.h" #include "paddle/platform/device_context.h" #include "paddle/platform/dynload/nccl.h" #include "paddle/platform/enforce.h" #include "paddle/platform/gpu_info.h" -#include -#include -#include - static int dev_count = 0; namespace paddle { @@ -32,6 +34,7 @@ TEST(NCCL, init) { std::vector comms; comms.resize(dev_count); PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + for (int i = 0; i < dev_count; ++i) { dynload::ncclCommDestroy(comms[i]); } @@ -47,7 +50,7 @@ struct PerThreadData { T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } - PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) { + PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { send_buff.resize(size); for (size_t i = 0; i < size; ++i) { send_buff[i] = static_cast(i); @@ -62,7 +65,7 @@ TEST(NCCL, all_reduce) { std::vector comms; comms.resize(dev_count); VLOG(1) << "Initializing ncclComm"; - PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); VLOG(1) << "ncclComm initialized"; VLOG(1) << "Creating thread data"; std::vector>> data; @@ -131,6 +134,18 @@ int main(int argc, char** argv) { << dev_count; return 0; } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 856e54df89c1c18ade040957188a2fbda0901473..f05260ccac4c2b11e5568d484141720fa4792d13 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/platform/place.h" @@ -23,7 +23,9 @@ class PlacePrinter : public boost::static_visitor<> { public: explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } - void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } + void operator()(const CUDAPlace &p) { + os_ << "CUDAPlace(" << p.device << ")"; + } private: std::ostream &os_; @@ -36,20 +38,31 @@ static Place the_default_place; void set_place(const Place &place) { the_default_place = place; } const Place &get_place() { return the_default_place; } -const GPUPlace default_gpu() { return GPUPlace(0); } +const CUDAPlace default_gpu() { return CUDAPlace(0); } const CPUPlace default_cpu() { return CPUPlace(); } bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsGPUPlace(), p); -} -bool is_cpu_place(const Place &p) { - return !boost::apply_visitor(IsGPUPlace(), p); + return boost::apply_visitor(IsCUDAPlace(), p); } +bool is_cpu_place(const Place &p) { return !is_gpu_place(p); } + bool places_are_same_class(const Place &p1, const Place &p2) { return p1.which() == p2.which(); } +bool is_same_place(const Place &p1, const Place &p2) { + if (places_are_same_class(p1, p2)) { + if (is_cpu_place(p1)) { + return true; + } else { + return boost::get(p1) == boost::get(p2); + } + } else { + return false; + } +} + std::ostream &operator<<(std::ostream &os, const Place &p) { detail::PlacePrinter printer(os); boost::apply_visitor(printer, p); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 5370360a7de26e409a1545182a12d3df1f37658b..fbb43fa043a44c302e6b1cc67d83c18d791f07c5 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include - +#include "paddle/platform/enforce.h" #include "paddle/platform/variant.h" namespace paddle { @@ -31,46 +31,67 @@ struct CPUPlace { inline bool operator!=(const CPUPlace &) const { return false; } }; -struct GPUPlace { - GPUPlace() : GPUPlace(0) {} - explicit GPUPlace(int d) : device(d) {} +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} inline int GetDeviceId() const { return device; } // needed for variant equality comparison - inline bool operator==(const GPUPlace &o) const { return device == o.device; } - inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } int device; }; -struct IsGPUPlace : public boost::static_visitor { +struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } - bool operator()(const GPUPlace &gpu) const { return true; } + bool operator()(const CUDAPlace &gpu) const { return true; } }; -// Define the max number of Place in bit length. 
i.e., the max number of places -// should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT) -#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4 - -typedef boost::variant Place; +typedef boost::variant Place; -// static check number of place types is less equal than -// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT) -BOOST_MPL_ASSERT((boost::mpl::less_equal< - Place::types::size, - boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>)); +using PlaceList = std::vector; void set_place(const Place &); const Place &get_place(); -const GPUPlace default_gpu(); +const CUDAPlace default_gpu(); const CPUPlace default_cpu(); bool is_gpu_place(const Place &); bool is_cpu_place(const Place &); bool places_are_same_class(const Place &, const Place &); +bool is_same_place(const Place &, const Place &); std::ostream &operator<<(std::ostream &, const Place &); +template +struct PlaceVisitorWrapper + : public boost::static_visitor { + const Visitor &visitor_; + explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} + + typename Visitor::result_type operator()(const CPUPlace &cpu) const { + return visitor_(cpu); + } + + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { +#ifdef PADDLE_WITH_CUDA + return visitor_(cuda); +#else + PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); + return typename Visitor::result_type(); +#endif + } +}; + +template +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc index 33e2e5a439ce6801c02daba4bcbd462a74d7a614..150b2d3b1fbacec18ea33156f30f1c9965aedb31 100644 --- a/paddle/platform/place_test.cc +++ b/paddle/platform/place_test.cc @@ -1,10 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
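The new is_same_place helper is stricter than places_are_same_class; in assertion form (an illustrative snippet, not from the patch):

#include <cassert>
#include "paddle/platform/place.h"

void PlaceSemanticsDemo() {
  using namespace paddle::platform;
  Place cpu = CPUPlace();
  Place g0 = CUDAPlace(0);
  Place g1 = CUDAPlace(1);

  assert(places_are_same_class(g0, g1));   // both are CUDAPlace variants...
  assert(!is_same_place(g0, g1));          // ...but on different devices
  assert(is_same_place(cpu, CPUPlace()));  // all CPUPlaces are interchangeable
  assert(!places_are_same_class(cpu, g0));
}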
#include "paddle/platform/place.h" #include #include "gtest/gtest.h" TEST(Place, Equality) { paddle::platform::CPUPlace cpu; - paddle::platform::GPUPlace g0(0), g1(1), gg0(0); + paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); EXPECT_EQ(cpu, cpu); EXPECT_EQ(g0, g0); @@ -22,6 +35,7 @@ TEST(Place, Default) { EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); + EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place())); paddle::platform::set_place(paddle::platform::CPUPlace()); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); } @@ -29,8 +43,8 @@ TEST(Place, Default) { TEST(Place, Print) { { std::stringstream ss; - ss << paddle::platform::GPUPlace(1); - EXPECT_EQ("GPUPlace(1)", ss.str()); + ss << paddle::platform::CUDAPlace(1); + EXPECT_EQ("CUDAPlace(1)", ss.str()); } { std::stringstream ss; diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e2e2d968ef877f6aa8b87ab8f044e89574dffa9 --- /dev/null +++ b/paddle/platform/profiler.cc @@ -0,0 +1,329 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/profiler.h" +#include +#include +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. +static std::string g_profiler_place = ""; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. 
+static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr g_event_list; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +Event::Event(EventKind kind, std::string name, uint32_t thread_id, + DeviceContext* dev_ctx) + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { +#ifdef PADDLE_WITH_CUDA + auto* cuda_dev_ctx = static_cast(dev_ctx); + if (cuda_dev_ctx) { + PADDLE_ENFORCE(cudaGetDevice(&device_)); + PADDLE_ENFORCE(cudaEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + has_cuda_ = true; + } +#endif + cpu_ns_ = GetTimeInNsec(); +} + +std::string Event::kind() const { + switch (kind_) { + case EventKind::kMark: + return "mark"; + case EventKind::kPushRange: + return "push"; + case EventKind::kPopRange: + return "pop"; + } + PADDLE_THROW("Unknown EventKind."); +} + +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); +} + +double Event::CudaElapsedMs(const Event& e) const { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(cudaEventSynchronize(event_)); + PADDLE_ENFORCE(cudaEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); + return ms; +#else + PADDLE_THROW("CUDA is not enabled"); +#endif +} + +#ifdef PADDLE_WITH_CUDA +static void ForEachDevice(std::function func) { + auto original_device = GetCurrentDeviceId(); + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + func(i); + } + SetDeviceId(original_device); +} +#endif + +inline EventList& GetEventList() { + if (!g_event_list) { + std::lock_guard guard(g_all_event_lists_mutex); + g_event_list = std::make_shared(); + g_thread_id = g_next_thread_id++; + g_all_event_lists.emplace_front(g_event_list); + } + return *g_event_list; +} + +void Mark(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); +} + +RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { + if (g_state == ProfilerState::kDisabled) return; + dev_ctx_ = dev_ctx; + name_ = name; + PushEvent(name_, dev_ctx_); +} + +RecordEvent::~RecordEvent() { + if (g_state == ProfilerState::kDisabled) return; + PopEvent(name_, dev_ctx_); +} + +void EnableProfiler(ProfilerState state) { + PADDLE_ENFORCE(state != ProfilerState::kDisabled, + "Can't enbale profling, since the input state is ", + "ProfilerState::kDisabled"); + PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, + "The profiling state should be disabled when calling ", + "EnableProfiler."); + g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? 
"CUDA" : "CPU"; +#ifdef PADDLE_WITH_CUDA + if (g_state == ProfilerState::kCUDA) { + // Generate some dummy evenets first to reduce the startup overhead. + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); + Mark("_cuda_startup_", dev_ctx); + dev_ctx->Wait(); + }); + } + } +#endif + // Mark the profiling start. + Mark("_start_profiler_", nullptr); +} + +std::vector> DisableProfiler() { + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. + Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; + std::vector> result; + std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); + } + return result; +} + +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event end time"; + } + + std::vector> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? 
+void ParseEvents(std::vector<std::vector<Event>>& events,
+                 EventSortingKey sorted_by) {
+  if (g_profiler_place == "") return;
+
+  std::string sorted_domain;
+  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
+  switch (sorted_by) {
+    case EventSortingKey::kCalls:
+      sorted_domain = "number of calls";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.calls > b.calls;
+      };
+      break;
+    case EventSortingKey::kTotal:
+      sorted_domain = "total time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.total_time > b.total_time;
+      };
+      break;
+    case EventSortingKey::kMin:
+      sorted_domain = "minimum time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.min_time > b.min_time;
+      };
+      break;
+    case EventSortingKey::kMax:
+      sorted_domain = "maximum time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.max_time > b.max_time;
+      };
+      break;
+    case EventSortingKey::kAve:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.ave_time > b.ave_time;
+      };
+      break;
+    default:
+      sorted_domain = "event end time";
+  }
+
+  std::vector<std::vector<EventItem>> events_table;
+  size_t max_name_width = 0;
+  for (size_t i = 0; i < events.size(); i++) {
+    std::list<Event> pushed_events;
+    std::vector<EventItem> event_items;
+    std::unordered_map<std::string, int> event_idx;
+
+    for (size_t j = 0; j < events[i].size(); j++) {
+      if (events[i][j].kind() == "push") {
+        pushed_events.push_back(events[i][j]);
+      } else if (events[i][j].kind() == "pop") {
+        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+        while (rit != pushed_events.rend() &&
+               rit->name() != events[i][j].name()) {
+          ++rit;
+        }
+
+        if (rit != pushed_events.rend()) {
+          double event_time = (g_profiler_place == "CUDA")
+                                  ? rit->CudaElapsedMs(events[i][j])
+                                  : rit->CpuElapsedMs(events[i][j]);
+          std::string event_name =
+              "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
+          max_name_width = std::max(max_name_width, event_name.size());
+
+          if (event_idx.find(event_name) == event_idx.end()) {
+            event_idx[event_name] = event_items.size();
+            EventItem event_item = {event_name, 1,          event_time,
+                                    event_time, event_time, event_time};
+            event_items.push_back(event_item);
+          } else {
+            int index = event_idx[event_name];
+            event_items[index].calls += 1;
+            // total time
+            event_items[index].total_time += event_time;
+            // min time
+            event_items[index].min_time =
+                std::min(event_time, event_items[index].min_time);
+            // max time
+            event_items[index].max_time =
+                std::max(event_time, event_items[index].max_time);
+          }
+
+          // remove the push marker from the list
+          pushed_events.erase((++rit).base());
+        } else {
+          LOG(WARNING) << "Cannot find the push marker of event \'"
+                       << events[i][j].name()
+                       << "\', which will be ignored in profiling report.";
+        }
+      }
+    }
+    // average time
+    for (auto& item : event_items) {
+      item.ave_time = item.total_time / item.calls;
+    }
+    // sort
+    if (sorted_by != EventSortingKey::kDefault) {
+      std::sort(event_items.begin(), event_items.end(), sorted_func);
+    }
+
+    events_table.push_back(event_items);
+    // log warning if there are events with `push` but without `pop`
+    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+    while (rit != pushed_events.rend()) {
+      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
+                   << "\', which will be ignored in profiling report.";
+      ++rit;
+    }
+  }
+
+  // Print report
+  PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
+}
+
+void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+                          std::string& sorted_domain, const size_t name_width,
+                          const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << " Profiling Report "
+            << "<-------------------------\n\n";
+  std::cout << "Place: " << g_profiler_place << std::endl;
+  std::cout << "Time unit: ms" << std::endl;
+  std::cout << "Sorted by " << sorted_domain
+            << " in descending order in the same thread\n\n";
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total"
+            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Max." << std::setw(data_width) << "Ave." << std::endl;
+  for (size_t i = 0; i < events_table.size(); ++i) {
+    for (size_t j = 0; j < events_table[i].size(); ++j) {
+      EventItem& event_item = events_table[i][j];
+      std::cout << std::setw(name_width) << event_item.name
+                << std::setw(data_width) << event_item.calls
+                << std::setw(data_width) << event_item.total_time
+                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.max_time
+                << std::setw(data_width) << event_item.ave_time << std::endl;
+    }
+  }
+  std::cout << std::endl;
+}
+
+}  // namespace platform
+}  // namespace paddle
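Taken together, the intended call sequence is enable, record, disable, parse. A minimal usage sketch based on the API above (the event name and the CPU-only state here are illustrative; see profiler_test.cc further down for the version that actually ships):

#include "paddle/platform/profiler.h"

void ProfiledStep() {
  namespace plat = paddle::platform;
  plat::EnableProfiler(plat::ProfilerState::kCPU);  // or kCUDA for GPU timing
  {
    // RAII: PushEvent at construction, PopEvent at scope exit.
    plat::RecordEvent r("forward", /*dev_ctx=*/nullptr);
    // ... code to be measured ...
  }
  auto events = plat::DisableProfiler();  // one std::vector<Event> per thread
  plat::ParseEvents(events, plat::EventSortingKey::kTotal);  // print report
}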
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..6df48ef8806e865f473b4317ac0283863c3c6f64
--- /dev/null
+++ b/paddle/platform/profiler.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <forward_list>
+#include <list>
+#include <mutex>
+#include <vector>
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+enum EventKind { kMark, kPushRange, kPopRange };
+
+class Event {
+ public:
+  // The DeviceContext is used to get the cuda stream.
+  // If CPU profiling mode, can pass nullptr.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        DeviceContext* dev_ctx);
+
+  std::string kind() const;
+  std::string name() const { return name_; }
+  uint32_t thread_id() const { return thread_id_; }
+  bool has_cuda() const { return has_cuda_; }
+
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event() const { return event_; }
+  int device() const { return device_; }
+#endif
+
+  double CpuElapsedMs(const Event& e) const;
+  double CudaElapsedMs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+  bool has_cuda_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  void Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+
+enum ProfilerState {
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
+};
+
+void Mark(const std::string& name, DeviceContext* dev_ctx);
+
+void PushEvent(const std::string& name, DeviceContext* dev_ctx);
+
+void PopEvent(const std::string& name, DeviceContext* dev_ctx);
+
+struct RecordEvent {
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
+
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
+  // Event name
+  std::string name_;
+};
+
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+
+// Return the event list of all threads. Assuming the returned value is
+// called event_lists, event_lists[i][j] represents the j-th Event of the
+// i-th thread.
+std::vector<std::vector<Event>> DisableProfiler();
+
+// The information of each event given in the profiling report
+struct EventItem {
+  std::string name;
+  int calls;
+  double total_time;
+  double min_time;
+  double max_time;
+  double ave_time;
+};
+
+// Candidate keys to sort the profiling report
+enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+
+// Parse the event list and output the profiling report
+void ParseEvents(std::vector<std::vector<Event>>&,
+                 EventSortingKey sorted_by = EventSortingKey::kDefault);
+
+// Print results
+void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+                          std::string& sorted_domain, const size_t name_width,
+                          const size_t data_width);
+}  // namespace platform
+}  // namespace paddle
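One subtlety in the header worth spelling out: EventList pushes each freshly allocated block onto the front of its forward_list, so Reduce() re-inserts every block at result.begin() to restore chronological order. A self-contained sketch of that flattening logic, with int standing in for Event:

#include <forward_list>
#include <vector>

// The newest block sits at the front of the forward_list, so prepending each
// visited block pushes earlier-created (older) blocks back toward the front.
std::vector<int> FlattenInOrder(std::forward_list<std::vector<int>>& blocks) {
  std::vector<int> result;
  for (auto& block : blocks) {  // iterates newest -> oldest
    result.insert(result.begin(), block.begin(), block.end());
  }
  return result;  // oldest events end up first, as Reduce() intends
}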
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..13dea713c71e147ed5dd8d090e92d86c96256c09
--- /dev/null
+++ b/paddle/platform/profiler_test.cc
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+#include "gtest/gtest.h"
+
+TEST(Event, CpuElapsedTime) {
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
+  EXPECT_TRUE(start_event.has_cuda() == false);
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
+  EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(Event, CudaElapsedTime) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
+  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
+  EXPECT_TRUE(start_event.has_cuda() == true);
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
+  EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
+}
+#endif
+
+TEST(RecordEvent, RecordEvent) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+  using paddle::platform::RecordEvent;
+  using paddle::platform::ProfilerState;
+  using paddle::platform::EventSortingKey;
+
+  ProfilerState state = ProfilerState::kCPU;
+  DeviceContext* dev_ctx = nullptr;
+#ifdef PADDLE_WITH_CUDA
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  state = ProfilerState::kCUDA;
+  dev_ctx =
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
+#endif
+  EnableProfiler(state);
+
+  /* Usage 1:
+   * PushEvent(evt_name, dev_ctx);
+   * ...
+   *   code to be analyzed
+   * ...
+   * PopEvent(evt_name, dev_ctx);
+   */
+  for (int loop = 0; loop < 3; ++loop) {
+    for (int i = 1; i < 5; ++i) {
+      std::string name = "op_" + std::to_string(i);
+      PushEvent(name, dev_ctx);
+      int counter = 1;
+      while (counter != i * 1000) counter++;
+      PopEvent(name, dev_ctx);
+    }
+  }
+
+  /* Usage 2:
+   * {
+   *   RecordEvent record_event(name, dev_ctx);
+   *   ...
+   *   code to be analyzed
+   *   ...
+   * }
+   */
+  for (int i = 1; i < 5; ++i) {
+    std::string name = "evs_op_" + std::to_string(i);
+    RecordEvent record_event(name, dev_ctx);
+    int counter = 1;
+    while (counter != i * 1000) counter++;
+  }
+
+  // Bad Usage:
+  PushEvent("event_without_pop", dev_ctx);
+  PopEvent("event_without_push", dev_ctx);
+  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
+  // Will remove parsing-related code from test later
+  ParseEvents(events, EventSortingKey::kTotal);
+
+  int cuda_startup_count = 0;
+  int start_profiler_count = 0;
+  int stop_profiler_count = 0;
+  for (size_t i = 0; i < events.size(); ++i) {
+    for (size_t j = 0; j < events[i].size(); ++j) {
+      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
+      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
+      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
+      if (events[i][j].name() == "push") {
+        EXPECT_EQ(events[i][j + 1].name(), "pop");
+#ifdef PADDLE_WITH_CUDA
+        EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
+#else
+        EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
+#endif
+      }
+    }
+  }
+  EXPECT_EQ(cuda_startup_count % 5, 0);
+  EXPECT_EQ(start_profiler_count, 1);
+  EXPECT_EQ(stop_profiler_count, 1);
+}
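The CudaElapsedTime test above exercises the standard CUDA event idiom that Event wraps. A stripped-down sketch of that idiom, assuming a CUDA toolchain and eliding the error checks that PADDLE_ENFORCE adds:

#include <cuda_runtime.h>

// Time the work queued on `stream` between the two records; this is what
// Event's constructor and CudaElapsedMs() do under the hood.
float StreamElapsedMs(cudaStream_t stream) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);
  // ... launch the kernels to be timed on `stream` here ...
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);  // block until `stop` has actually happened
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}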
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index bb9d59ec0a18ce013632f128c9b5d230255f1ac4..a88902b164c7a705cae09662e724b35cd8c8b4bf 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
@@ -31,7 +31,7 @@ namespace paddle {
 namespace platform {
 
 // Transform on host or device. It provides the same API in std library.
-template <typename Place>
+template <typename DeviceContext>
 struct Transform {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first, InputIter last,
@@ -45,16 +45,16 @@ struct Transform {
 };
 
 template <>
-struct Transform<platform::CPUPlace> {
+struct Transform<platform::CPUDeviceContext> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-                  OutputIter result, UnaryOperation op) {
+  void operator()(const platform::CPUDeviceContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
     std::transform(first, last, result, op);
   }
 
   template <typename InputIter1, typename InputIter2, typename OutputIter,
             typename BinaryOperation>
-  void operator()(const DeviceContext& context, InputIter1 first1,
+  void operator()(const platform::CPUDeviceContext& context, InputIter1 first1,
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
     std::transform(first1, last1, first2, result, op);
@@ -63,27 +63,25 @@ struct Transform {
 
 #ifdef __NVCC__
 template <>
-struct Transform<platform::GPUPlace> {
+struct Transform<platform::CUDADeviceContext> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-                  OutputIter result, UnaryOperation op) {
+  void operator()(const platform::CUDADeviceContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
-    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
-    thrust::transform(thrust::cuda::par.on(ctx.stream()),
+    thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::DevPtrCast(first), details::DevPtrCast(last),
                       details::DevPtrCast(result), op);
   }
 
   template <typename InputIter1, typename InputIter2, typename OutputIter,
             typename BinaryOperation>
-  void operator()(const DeviceContext& context, InputIter1 first1,
+  void operator()(const platform::CUDADeviceContext& context, InputIter1 first1,
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
-    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
-    thrust::transform(thrust::cuda::par.on(ctx.stream()),
+    thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::DevPtrCast(first1), details::DevPtrCast(last1),
                       details::DevPtrCast(first2), details::DevPtrCast(result),
                       op);
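After this change the right specialization is selected by the static type of the device context instead of by Place, and the CUDA path no longer needs a reinterpret_cast from the polymorphic base. A hypothetical caller on the CPU path (AddOne is an illustrative functor, not part of the patch):

#include "paddle/platform/transform.h"

// Illustrative functor; any unary callable works on the CPU path.
struct AddOne {
  float operator()(float x) const { return x + 1.0f; }
};

void AddOneToAll(const paddle::platform::CPUDeviceContext& ctx, float* buf,
                 int n) {
  // Resolves to Transform<CPUDeviceContext> at compile time, which simply
  // forwards to std::transform.
  paddle::platform::Transform<paddle::platform::CPUDeviceContext> trans;
  trans(ctx, buf, buf + n, buf, AddOne());
}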
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
index c76cab80e4b0e8df98a7be15f86699cfb6f93af2..af9204a0a7b6e09fcfdacb8ba985e269665b4034 100644
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "paddle/memory/memcpy.h"
@@ -39,7 +39,7 @@ TEST(Transform, CPUUnary) {
   using namespace paddle::platform;
   CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform<CPUPlace> trans;
+  Transform<CPUDeviceContext> trans;
   trans(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
@@ -49,15 +49,15 @@ TEST(Transform, GPUUnary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
-  GPUPlace gpu0(0);
+  CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
-  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
-  Transform<GPUPlace> trans;
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
-  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
@@ -68,7 +68,7 @@ TEST(Transform, CPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform<CPUPlace> trans;
+  Transform<CPUDeviceContext> trans;
   CPUDeviceContext ctx;
   trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
@@ -80,14 +80,14 @@ TEST(Transform, GPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  GPUPlace gpu0(0);
+  CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
-  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
-  Transform<GPUPlace> trans;
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
-  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
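The next hunk, in variant.h, deals with a CUDA 9 quirk: nvcc stopped defining __CUDACC_VER__ as a number (it expands to an error message instead), while headers included just below it, presumably Boost, still expect the old numeric encoding. A small check of what that reconstructed encoding evaluates to (the macro name here is illustrative):

// Illustration only: the pre-CUDA-9 __CUDACC_VER__ encoding restored by the
// hunk below. For nvcc 9.0.176: 9 * 10000 + 0 * 100 + 176 == 90176.
#define EXAMPLE_CUDACC_VER(major, minor, build) \
  ((major)*10000 + (minor)*100 + (build))
static_assert(EXAMPLE_CUDACC_VER(9, 0, 176) == 90176,
              "same numeric scheme as the pre-CUDA-9 __CUDACC_VER__");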
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index 619897ca19eb2e6f4dbfd9160edf8c4bc58c89a9..ea6ef8fddf5171731fa1fd323983bf7d90ecdd9a 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -1,19 +1,32 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
+#ifdef __CUDACC__
+#ifdef __CUDACC_VER_MAJOR__
+// CUDA 9 defines `__CUDACC_VER__` as a warning message, manually define
+// __CUDACC_VER__ instead.
+#undef __CUDACC_VER__
+
+#define __CUDACC_VER__                                         \
+  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
+   __CUDACC_VER_BUILD__)
+#endif
+
+#endif
+
 #include <boost/variant.hpp>
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index ccfc0e76020c7b4f54a493cc4048e7571379ec1a..f75475a88f7224ee3889827795088c8aa920b63b 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -49,7 +49,7 @@ if(WITH_TESTING)
   add_subdirectory(test)
 endif()
 
-if(NOT WITH_C_API)
+if(NOT MOBILE_INFERENCE)
   add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
   link_paddle_exe(paddle_pserver_main)
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index a9bcc474387513a8ca019bc9382b88c93e08ff8d..7b374307071d2da91a677361b404448f1a3816b0 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,8 +1,13 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
-    SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc tensor_array paddle_memory executor prune
+    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
+    DEPS pybind python backward proto_desc paddle_memory executor prune init
     ${GLOB_OP_LIB})
+  if(NOT APPLE AND NOT ANDROID)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT ANDROID)
 endif(WITH_PYTHON)
 
-cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array)
+if(WITH_DOC)
+  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+endif(WITH_DOC)
diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b13ad42ea29453354798d88bff8ef47339d1a614
--- /dev/null
+++ b/paddle/pybind/const_value.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "const_value.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindConstValue(pybind11::module& m) {
+  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m.def("kTempVarName", [] { return framework::kTempVarName; });
+  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/const_value.h b/paddle/pybind/const_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d57c972a9d5339c0e155fa1d6395af9face8744
--- /dev/null
+++ b/paddle/pybind/const_value.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/platform/enforce.h" +#include "pybind11/pybind11.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +extern void BindConstValue(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/pybind/exception.cc b/paddle/pybind/exception.cc index ff79b12ee4b28c53ee04f4c170b5bca9ca28d14a..e29ac3ebab760a011a3798f8e4be46270d6b80cc 100644 --- a/paddle/pybind/exception.cc +++ b/paddle/pybind/exception.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/pybind/exception.h" diff --git a/paddle/pybind/exception.h b/paddle/pybind/exception.h index 70beac146046f74e23f747bab130483901a7d443..436ddd5707ace37a5668c8d4401c1bdcf2dadfe3 100644 --- a/paddle/pybind/exception.h +++ b/paddle/pybind/exception.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once #include diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc index 24f2a9383f7a069f1a8c7ed2bf3da46720470efa..99694fa592059d979297b72748125d02b2dd70a3 100644 --- a/paddle/pybind/print_operators_doc.cc +++ b/paddle/pybind/print_operators_doc.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include #include // std::stringstream #include @@ -31,31 +44,32 @@ std::string Escape(const std::string& s) { return r; } -std::string AttrType(paddle::framework::AttrType at) { +std::string AttrType(paddle::framework::proto::AttrType at) { switch (at) { - case paddle::framework::INT: + case paddle::framework::proto::INT: return "int"; - case paddle::framework::FLOAT: + case paddle::framework::proto::FLOAT: return "float"; - case paddle::framework::STRING: + case paddle::framework::proto::STRING: return "string"; - case paddle::framework::BOOLEAN: + case paddle::framework::proto::BOOLEAN: return "bool"; - case paddle::framework::INTS: + case paddle::framework::proto::INTS: return "int array"; - case paddle::framework::FLOATS: + case paddle::framework::proto::FLOATS: return "float array"; - case paddle::framework::STRINGS: + case paddle::framework::proto::STRINGS: return "string array"; - case paddle::framework::BOOLEANS: + case paddle::framework::proto::BOOLEANS: return "bool array"; - case paddle::framework::BLOCK: + case paddle::framework::proto::BLOCK: return "block id"; } return "UNKNOWN"; // not possible } -void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) { +void PrintVar(const paddle::framework::proto::OpProto::Var& v, + std::stringstream& ss) { ss << " { " << "\n" << " \"name\" : \"" << Escape(v.name()) << "\",\n" @@ -65,7 +79,7 @@ void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) { << " },"; } -void PrintAttr(const paddle::framework::OpProto::Attr& a, +void PrintAttr(const paddle::framework::proto::OpProto::Attr& a, std::stringstream& ss) { ss << " { " << "\n" @@ -81,7 +95,7 @@ void PrintOpProto(const std::string& type, std::stringstream& ss) { std::cerr << "Processing " << type << "\n"; - const paddle::framework::OpProto* p = opinfo.proto_; + const paddle::framework::proto::OpProto* p = opinfo.proto_; if (p == nullptr) { return; // It is possible that an operator doesn't have OpProto. 
} diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 5a1ff9b7976abbe4a37f8366181d9d1ae78ea4a0..4f959481537d29c089be24f9ae306f860c196c0f 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) { // Bind Methods void BindProgramDesc(py::module &m) { - py::class_(m, "ProgramDesc", "") + py::class_(m, "ProgramDesc", "") .def(py::init<>()) .def("__init__", - [](ProgramDescBind &self, const ProgramDescBind &other) { - new (&self) ProgramDescBind(other); + [](ProgramDesc &self, const ProgramDesc &other) { + new (&self) ProgramDesc(other); }) .def("__init__", - [](ProgramDescBind &self, const py::bytes &binary_str) { + [](ProgramDesc &self, const py::bytes &binary_str) { std::string str(binary_str); - new (&self) ProgramDescBind(str); + new (&self) ProgramDesc(str); }) - .def("append_block", &ProgramDescBind::AppendBlock, + .def("append_block", &ProgramDesc::AppendBlock, py::return_value_policy::reference) .def("append_backward", - [](ProgramDescBind &program_desc, const VarDescBind &target, + [](ProgramDesc &program_desc, const VarDesc &target, const std::unordered_set &no_grad_vars) { ParamGradInfoMap param_grad_map = AppendBackward(program_desc, target, no_grad_vars); @@ -138,13 +138,13 @@ void BindProgramDesc(py::module &m) { } return retv; }) - .def("block", &ProgramDescBind::MutableBlock, + .def("block", &ProgramDesc::MutableBlock, py::return_value_policy::reference) - .def("num_blocks", &ProgramDescBind::Size) - .def("serialize_to_string", SerializeMessage) + .def("num_blocks", &ProgramDesc::Size) + .def("serialize_to_string", SerializeMessage) .def("parse_from_string", - [](ProgramDescBind &program_desc, const std::string &data) { - ProgramDesc *desc = program_desc.Proto(); + [](ProgramDesc &program_desc, const std::string &data) { + proto::ProgramDesc *desc = program_desc.Proto(); PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. 
This could " "be a bug of Paddle."); @@ -152,109 +152,135 @@ void BindProgramDesc(py::module &m) { } void BindBlockDesc(py::module &m) { - py::class_(m, "BlockDesc", "") - .def_property_readonly("id", &BlockDescBind::ID) - .def_property_readonly("parent", &BlockDescBind::Parent) - .def("append_op", &BlockDescBind::AppendOp, + py::class_(m, "BlockDesc", "") + .def_property_readonly("id", &BlockDesc::ID) + .def_property_readonly("parent", &BlockDesc::Parent) + .def("append_op", &BlockDesc::AppendOp, py::return_value_policy::reference) - .def("prepend_op", &BlockDescBind::PrependOp, + .def("prepend_op", &BlockDesc::PrependOp, py::return_value_policy::reference) + .def("remove_op", &BlockDesc::RemoveOp) .def("var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.Var(name); }, py::return_value_policy::reference) .def("has_var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.HasVar(name); }) + .def("has_var_recursive", + [](BlockDesc &self, py::bytes byte_name) { + std::string name = byte_name; + return self.HasVarRecursive(name); + }) .def("find_var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.FindVar(name); }, py::return_value_policy::reference) - .def("all_vars", &BlockDescBind::AllVars, + .def("find_var_recursive", + [](BlockDesc &self, py::bytes byte_name) { + std::string name = byte_name; + return self.FindVarRecursive(name); + }, py::return_value_policy::reference) - .def("op_size", &BlockDescBind::OpSize) - .def("op", &BlockDescBind::Op, py::return_value_policy::reference) - .def("serialize_to_string", SerializeMessage); + .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) + .def("op_size", &BlockDesc::OpSize) + .def("op", &BlockDesc::Op, py::return_value_policy::reference) + .def("serialize_to_string", SerializeMessage); } void BindVarDsec(py::module &m) { - py::enum_(m, "DataType", "") - .value("BOOL", DataType::BOOL) - .value("INT16", DataType::INT16) - .value("INT32", DataType::INT32) - .value("INT64", DataType::INT64) - .value("FP16", DataType::FP16) - .value("FP32", DataType::FP32) - .value("FP64", DataType::FP64); + py::enum_(m, "DataType", "") + .value("BOOL", proto::DataType::BOOL) + .value("INT16", proto::DataType::INT16) + .value("INT32", proto::DataType::INT32) + .value("INT64", proto::DataType::INT64) + .value("FP16", proto::DataType::FP16) + .value("FP32", proto::DataType::FP32) + .value("FP64", proto::DataType::FP64); - py::class_ var_desc(m, "VarDesc", ""); + py::class_ var_desc(m, "VarDesc", ""); var_desc .def("name", - [](const VarDescBind &self) { + [](const VarDesc &self) { py::bytes name = self.Name(); return name; }, py::return_value_policy::reference) - .def("set_shape", &VarDescBind::SetShape) - .def("set_data_type", &VarDescBind::SetDataType) - .def("shape", &VarDescBind::Shape, py::return_value_policy::reference) - .def("data_type", &VarDescBind::GetDataType) - .def("lod_level", &VarDescBind::GetLodLevel) - .def("set_lod_level", &VarDescBind::SetLoDLevel) - .def("type", &VarDescBind::GetType) - .def("set_type", &VarDescBind::SetType) - .def("serialize_to_string", SerializeMessage) - .def("persistable", &VarDescBind::Persistable) - .def("set_persistable", &VarDescBind::SetPersistable); + .def("set_shape", &VarDesc::SetShape) + .def("set_dtype", &VarDesc::SetDataType) + 
.def("shape", &VarDesc::Shape, py::return_value_policy::reference) + .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference) + .def("lod_level", &VarDesc::GetLoDLevel) + .def("set_lod_level", &VarDesc::SetLoDLevel) + .def("type", &VarDesc::GetType) + .def("set_type", &VarDesc::SetType) + .def("serialize_to_string", SerializeMessage) + .def("persistable", &VarDesc::Persistable) + .def("set_persistable", &VarDesc::SetPersistable); - py::enum_(var_desc, "VarType", "") - .value("LOD_TENSOR", VarDesc::LOD_TENSOR) - .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS) - .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH) - .value("FETCH_LIST", VarDesc::FETCH_LIST) - .value("STEP_SCOPES", VarDesc::STEP_SCOPES) - .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE) - .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY); + py::enum_(var_desc, "VarType", "") + .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR) + .value("SELECTED_ROWS", proto::VarDesc::SELECTED_ROWS) + .value("FEED_MINIBATCH", proto::VarDesc::FEED_MINIBATCH) + .value("FETCH_LIST", proto::VarDesc::FETCH_LIST) + .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES) + .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE) + .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY) + .value("PLACE_LIST", proto::VarDesc::PLACE_LIST); } void BindOpDesc(py::module &m) { - py::enum_(m, "AttrType", "") - .value("INT", AttrType::INT) - .value("INTS", AttrType::INTS) - .value("FLOAT", AttrType::FLOAT) - .value("FLOATS", AttrType::FLOATS) - .value("STRING", AttrType::STRING) - .value("STRINGS", AttrType::STRINGS) - .value("BOOL", AttrType::BOOLEAN) - .value("BOOLS", AttrType::BOOLEANS) - .value("BLOCK", AttrType::BLOCK); + py::enum_(m, "AttrType", "") + .value("INT", proto::AttrType::INT) + .value("INTS", proto::AttrType::INTS) + .value("FLOAT", proto::AttrType::FLOAT) + .value("FLOATS", proto::AttrType::FLOATS) + .value("STRING", proto::AttrType::STRING) + .value("STRINGS", proto::AttrType::STRINGS) + .value("BOOL", proto::AttrType::BOOLEAN) + .value("BOOLS", proto::AttrType::BOOLEANS) + .value("BLOCK", proto::AttrType::BLOCK); - py::class_ op_desc(m, "OpDesc", ""); - op_desc.def("type", &OpDescBind::Type) - .def("set_type", &OpDescBind::SetType) - .def("input", &OpDescBind::Input) - .def("input_names", &OpDescBind::InputNames) - .def("set_input", &OpDescBind::SetInput) - .def("output", &OpDescBind::Output) - .def("output_names", &OpDescBind::OutputNames) - .def("set_output", &OpDescBind::SetOutput) - .def("has_attr", &OpDescBind::HasAttr) - .def("attr_type", &OpDescBind::GetAttrType) - .def("attr_names", &OpDescBind::AttrNames) - .def("set_attr", &OpDescBind::SetAttr) - .def("attr", &OpDescBind::GetAttr) - .def("set_block_attr", &OpDescBind::SetBlockAttr) - .def("block_attr", &OpDescBind::GetBlockAttr) - .def("check_attrs", &OpDescBind::CheckAttrs) - .def("infer_shape", &OpDescBind::InferShape) - .def("infer_var_type", &OpDescBind::InferVarType) - .def("serialize_to_string", SerializeMessage); + py::class_ op_desc(m, "OpDesc", ""); + op_desc + .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); }, + py::return_value_policy::reference) + .def("copy_from", &OpDesc::CopyFrom) + .def("type", &OpDesc::Type) + .def("set_type", &OpDesc::SetType) + .def("input", &OpDesc::Input) + .def("input_names", &OpDesc::InputNames) + .def("output", &OpDesc::Output) + .def("output_names", &OpDesc::OutputNames) + .def("set_input", &OpDesc::SetInput) + .def("set_output", &OpDesc::SetOutput) + .def("input_arg_names", &OpDesc::InputArgumentNames) + 
.def("output_arg_names", &OpDesc::OutputArgumentNames) + .def("rename_input", &OpDesc::RenameInput) + .def("rename_output", &OpDesc::RenameOutput) + .def("has_attr", &OpDesc::HasAttr) + .def("attr_type", &OpDesc::GetAttrType) + .def("attr_names", &OpDesc::AttrNames) + .def("set_attr", &OpDesc::SetAttr) + .def("attr", &OpDesc::GetAttr) + .def("set_block_attr", &OpDesc::SetBlockAttr) + .def("set_serialized_attr", + [](OpDesc &self, const std::string &name, + const py::bytes &seriralized) { + std::string ser(seriralized); + self.SetAttr(name, ser); + }) + .def("block_attr", &OpDesc::GetBlockAttr) + .def("check_attrs", &OpDesc::CheckAttrs) + .def("infer_shape", &OpDesc::InferShape) + .def("infer_var_type", &OpDesc::InferVarType) + .def("serialize_to_string", SerializeMessage); } } // namespace pybind diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 3d8d3f1d2fd3977f945928c723db5fcafffeae85..c5d70bc9f91bc92b28a546cc79b08a9fda150050 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -16,22 +16,21 @@ limitations under the License. */ #include // for call_once #include -#include "gflags/gflags.h" #include "paddle/framework/backward.h" #include "paddle/framework/executor.h" #include "paddle/framework/feed_fetch_method.h" #include "paddle/framework/framework.pb.h" +#include "paddle/framework/init.h" #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/prune.h" #include "paddle/framework/selected_rows.h" -#include "paddle/framework/tensor_array.h" #include "paddle/operators/cond_op.h" -#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" +#include "paddle/pybind/const_value.h" #include "paddle/pybind/exception.h" #include "paddle/pybind/pybind.h" #include "paddle/pybind/tensor_py.h" @@ -39,6 +38,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/operators/nccl/nccl_gpu_common.h" +#include "paddle/platform/cuda_profiler.h" #include "paddle/platform/gpu_info.h" #endif @@ -52,24 +52,6 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) { return generators[prefix].fetch_add(1); } -std::once_flag gflags_init_flag; - -// TODO(qijun) move init gflags to init.cc -void InitGflags(std::vector &argv) { - std::call_once(gflags_init_flag, [&]() { - int argc = argv.size(); - char **arr = new char *[argv.size()]; - std::string line; - for (size_t i = 0; i < argv.size(); i++) { - arr[i] = &argv[i][0]; - line += argv[i]; - line += ' '; - } - google::ParseCommandLineFlags(&argc, &arr, true); - VLOG(1) << "Init commandline: " << line; - }); -} - bool IsCompileGPU() { #ifndef PADDLE_WITH_CUDA return false; @@ -96,8 +78,12 @@ PYBIND11_PLUGIN(core) { [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) + .def("set_layout", + [](Tensor &self, const std::string &layout) { + self.set_layout(StringToDataLayout(layout)); + }) .def("alloc_float", - [](Tensor &self, paddle::platform::GPUPlace &place) { + [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("alloc_float", @@ -109,7 +95,7 @@ PYBIND11_PLUGIN(core) { self.mutable_data(place); }) .def("alloc_int", - [](Tensor &self, paddle::platform::GPUPlace &place) { + [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("set", PyCPUTensorSetFromArray) @@ -283,16 +269,39 @@ All parameter, weight, gradient are variables in Paddle. } return ret_values; }); - m.def("prune", [](const ProgramDescBind &origin, + m.def( + "get_grad_op_desc", [](const OpDesc &op_desc, + const std::unordered_set &no_grad_set, + const std::vector &grad_sub_block) { + std::unordered_map grad_to_var; + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, + grad_sub_block); + std::vector grad_op_desc_ptrs(grad_op_descs.size()); + std::transform(grad_op_descs.begin(), grad_op_descs.end(), + grad_op_desc_ptrs.begin(), + [](std::unique_ptr &p) { return p.release(); }); + return std::make_pair(grad_op_desc_ptrs, grad_to_var); + }); + m.def("prune", [](const ProgramDesc &origin, const std::vector> &targets) { - ProgramDescBind prog_with_targets(origin); + ProgramDesc prog_with_targets(origin); for (const auto &t : targets) { prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget(); } - ProgramDesc pruned_desc; + proto::ProgramDesc pruned_desc; Prune(*prog_with_targets.Proto(), &pruned_desc); - return new ProgramDescBind(pruned_desc); + return new ProgramDesc(pruned_desc); + }); + m.def("inference_optimize", [](ProgramDesc &origin) { + proto::ProgramDesc pruned_desc; + InferenceOptimize(*(origin.Proto()), &pruned_desc); + return new ProgramDesc(pruned_desc); }); + m.def("empty_var_name", []() { return framework::kEmptyVarName; }); + m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; }); m.def_submodule( "var_names", "The module will return special predefined variable name in Paddle") @@ -306,10 +315,10 @@ All parameter, weight, gradient are variables in Paddle. 
return new paddle::platform::CPUDeviceContext(); }) .def_static("create", - [](paddle::platform::GPUPlace& place) + [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("GPUPlace is not supported in CPU device."); + PADDLE_THROW("CUDAPlace is not supported in CPU device."); #else return new paddle::platform::CUDADeviceContext(place); #endif @@ -319,9 +328,9 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_CUDA py::class_(m, "Communicator").def(py::init<>()); #endif - py::class_(m, "GPUPlace") + py::class_(m, "CUDAPlace") .def(py::init()) - .def("__str__", string::to_string); + .def("__str__", string::to_string); py::class_(m, "CPUPlace") .def(py::init<>()) @@ -334,14 +343,14 @@ All parameter, weight, gradient are variables in Paddle. self = cpu_place; }) .def("set_place", - [](platform::Place &self, const platform::GPUPlace &gpu_place) { + [](platform::Place &self, const platform::CUDAPlace &gpu_place) { self = gpu_place; }); py::class_(m, "Operator") .def_static("create", [](py::bytes protobin) { - OpDesc desc; + proto::OpDesc desc; PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), "Cannot parse user input to OpDesc"); PADDLE_ENFORCE(desc.IsInitialized(), @@ -356,10 +365,10 @@ All parameter, weight, gradient are variables in Paddle. }) .def("run", [](OperatorBase &self, const Scope &scope, - const platform::DeviceContext &dev_ctx) { - self.Run(scope, dev_ctx); - dev_ctx.Wait(); - }) + const platform::CPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::CUDAPlace &place) { self.Run(scope, place); }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -390,88 +399,11 @@ All parameter, weight, gradient are variables in Paddle. 
self->CompleteAddOp(); }); - py::class_(m, "TensorArray") - .def("__init__", - [](TensorArray &instance) { new (&instance) TensorArray(); }) - .def("read", - [](TensorArray &self, size_t index) { return self.Read(index); }) - .def("write", [](TensorArray &self, size_t index, - LoDTensor &value) { self.Write(index, value); }) - .def("write_shared", - [](TensorArray &self, size_t index, const LoDTensor &value) { - self.WriteShared(index, value); - }) - .def("size", [](TensorArray &self) { return self.size(); }) - .def("pack", - [](TensorArray &self, size_t level, - const std::vector> &meta_info, - const std::vector> &lod) { - std::vector meta; - for (auto &info : meta_info) { - PADDLE_ENFORCE_EQ(info.size(), 3UL); - meta.emplace_back(info[0], info[1], info[2]); - } -#ifndef PADDLE_WITH_CUDA - return self.Pack(level, meta, lod); -#else - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return self.Pack(level, meta, new_lod); -#endif - }) - .def("unpack", - [](TensorArray &self, const LoDTensor &source, int level, - bool length_descend) { - auto metas = self.Unpack(source, level, length_descend); - std::vector> meta_info; - for (auto meta : metas) { - meta_info.emplace_back( - std::vector({meta.begin, meta.end, meta.ori_idx})); - } - return meta_info; - }) - .def("stack", [](TensorArray &self) { return self.Stack(); }) - .def("unstack", - [](TensorArray &self, const LoDTensor &source) { - return self.Unstack(source); - }) - .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) { - return self.UnstackShared(source); - }); - - py::class_(m, - "DynamicRecurrentOp") - .def_static("create", - [](py::bytes protobin) -> operators::DynamicRecurrentOp * { - OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - auto rnn_op = OpRegistry::CreateOp(desc); - return static_cast( - rnn_op.release()); - }) - .def("set_step_unit", - [](operators::DynamicRecurrentOp &self, const operators::NetOp &net) - -> void { self.rnn.SetStepUnit(net.Clone()); }) - .def("get_state", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.state(name); }) - .def("get_step_input", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.step_input(name); }) - .def("get_step_output", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.step_output(name); }); - // cond_op py::class_(m, "CondOp") .def_static("create", [](py::bytes protobin) -> operators::CondOp * { - OpDesc desc; + proto::OpDesc desc; PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), "Cannot parse user input to OpDesc"); PADDLE_ENFORCE(desc.IsInitialized(), @@ -490,13 +422,16 @@ All parameter, weight, gradient are variables in Paddle. 
}); py::class_(m, "Executor") - .def(py::init &>()) + .def(py::init()) .def("run", &Executor::Run); m.def("unique_integer", UniqueIntegerGenerator); - m.def("init_gflags", InitGflags); + m.def("init_gflags", framework::InitGflags); + m.def("init_glog", framework::InitGLOG); + m.def("init_devices", &framework::InitDevices); m.def("is_compile_gpu", IsCompileGPU); + m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); @@ -504,6 +439,7 @@ All parameter, weight, gradient are variables in Paddle. BindBlockDesc(m); BindVarDsec(m); BindOpDesc(m); + BindConstValue(m); py::class_(m, "LodRankTable") .def("items", [](framework::LoDRankTable &table) { @@ -534,6 +470,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + + m.def("nvprof_init", platform::CudaProfilerInit); + m.def("nvprof_start", platform::CudaProfilerStart); + m.def("nvprof_stop", platform::CudaProfilerStop); #endif return m.ptr(); diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 41fa658502d341fe9653a3e99b58498fcaeada47..3b5210e2b91dd697e213a0c847f73c6969cd654b 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -1,21 +1,22 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include -#include "paddle/framework/tensor.h" +#include "paddle/framework/lod_tensor.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" @@ -61,21 +62,25 @@ struct CastToPyBufferImpl { auto *src_ptr = static_cast(tensor.data()); auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to - // a Python numpy array. It's better to manage CDUA stream unifiedly. 
- paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, - sizeof(CUR_TYPE) * tensor.numel(), - cudaMemcpyDeviceToHost); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto dev_ctx = static_cast( + pool.Get(tensor.place())); + + paddle::platform::GpuMemcpyAsync( + dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), + cudaMemcpyDeviceToHost, dev_ctx->stream()); #else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif } else if (paddle::platform::is_cpu_place(tensor.place())) { dst_tensor = tensor; } - return py::buffer_info( - dst_tensor.mutable_data(dst_tensor.place()), - sizeof(CUR_TYPE), py::format_descriptor::format(), - (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); + return py::buffer_info(dst_tensor.data(), sizeof(CUR_TYPE), + py::format_descriptor::format(), + (size_t)framework::arity(dst_tensor.dims()), + dims_outside, strides); } else { constexpr bool less = I + 1 < std::tuple_size>::value; return CastToPyBufferImpl()(tensor); @@ -92,14 +97,27 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { template T TensorGetElement(framework::Tensor &self, size_t offset) { - PADDLE_ENFORCE(platform::is_cpu_place(self.place())); - return self.data()[offset]; + if (platform::is_cpu_place(self.place())) { + return self.data()[offset]; + } else { + std::shared_ptr dst(new framework::Tensor); + framework::Copy(self, platform::CPUPlace(), dst.get()); + return dst->data()[offset]; + } } +// TODO(dzhwinter) : fix the redundent Tensor allocate and free template void TensorSetElement(framework::Tensor &self, size_t offset, T elem) { - PADDLE_ENFORCE(platform::is_cpu_place(self.place())); - self.data()[offset] = elem; + if (platform::is_gpu_place(self.place())) { + std::shared_ptr dst(new framework::Tensor); + framework::Copy(self, platform::CPUPlace(), dst.get()); + dst->data()[offset] = elem; + framework::Copy(*dst.get(), self.place(), &self); + + } else if (platform::is_cpu_place(self.place())) { + self.data()[offset] = elem; + } } template @@ -123,7 +141,7 @@ template void PyCUDATensorSetFromArray( framework::Tensor &self, py::array_t array, - paddle::platform::GPUPlace &place) { + paddle::platform::CUDAPlace &place) { std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { @@ -132,10 +150,12 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - // TODO(qijun): Here we use default CUDA stream to set a Python numpy - // array to a GPU Tensor. It's better to manage CDUA stream unifiedly. 
 
 template <typename T>
@@ -123,7 +141,7 @@ template <typename T>
 void PyCUDATensorSetFromArray(
     framework::Tensor &self,
     py::array_t<T> array,
-    paddle::platform::GPUPlace &place) {
+    paddle::platform::CUDAPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
@@ -132,10 +150,12 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
-  // TODO(qijun): Here we use default CUDA stream to set a Python numpy
-  // array to a GPU Tensor. It's better to manage CUDA stream unifiedly.
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }
 #endif
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index a52f06fe497dac467e4ef2543ebda7a423ca326d..68cb5a19f99ab5148b04d193eb2356588bdc5a59 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -5,11 +5,3 @@ configure_file(submit_local.sh.in
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
                     GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-configure_file(tools/usage_stat/usage.sh
-               paddle_usage
-               @ONLY)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin
-        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-                    GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh
new file mode 100755
index 0000000000000000000000000000000000000000..af16b84ca8a18151f0fa36d39fd201d3cab21a5f
--- /dev/null
+++ b/paddle/scripts/check_env.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+
+if [ "`uname -s`" != "Linux" ]; then
+  echo "This script currently supports Linux only!"
+  exit 0
+fi
+
+echo "========================= Hardware Information ========================="
+sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
+cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
+ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
+physical_cores=$((sockets * cores_per_socket))
+virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
+numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
+echo "CPU Name               : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
+echo "CPU Family             : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
+echo "Socket Number          : $sockets"
+echo "Cores Per Socket       : $cores_per_socket"
+echo "Total Physical Cores   : $physical_cores"
+echo "Total Virtual Cores    : $virtual_cores"
+if [ $ht -eq 1 ]; then
+  echo "Hyper Threading        : OFF"
+  if [ $physical_cores -ne $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+else
+  echo "Hyper Threading        : ON"
+  if [ $physical_cores -ge $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+fi
+echo "NUMA Nodes             : $numa_nodes"
+if [ $numa_nodes -lt $sockets ]; then
+  echo "Warning: NUMA nodes are not enough for the best performance,\
+ at least $sockets"
+fi
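The hardware section above infers the Hyper-Threading status by comparing physical against virtual core counts. A hedged cross-check of the same logic in Python (psutil is an assumption here; the script itself relies only on lscpu and /proc/cpuinfo):

```python
import psutil  # assumed available; check_env.sh does not depend on it

physical = psutil.cpu_count(logical=False)
logical = psutil.cpu_count(logical=True)
# Mirrors the script: HT is ON exactly when there are more logical
# processors than physical cores.
print("Hyper Threading        : %s" % ("ON" if logical > physical else "OFF"))
```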
+
+echo "-------------------------- Memory Information --------------------------"
+# dmidecode support starts from 2.11
+dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
+if [ $dmi_ver -lt 2 ]; then
+  echo "Error: dmidecode unknown or version is too old"
+  exit 0
+fi
+if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
+  echo "Error: need root to run dmidecode"
+  exit 0
+fi
+max_dimms=0
+num_dimms_installed=0
+for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
+  num_refered=`dmidecode |grep -wc "$dimm_id"`
+  # the actual dimm id should be referred only once
+  if [ $num_refered -eq 1 ]; then
+    num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
+      /Unknown/ {f=1};
+      /Manufacturer/ {if (s==1) {print f; exit 0;}};'`
+    if [ $num_unknown -eq 0 ]; then
+      dimms_installed="$dimms_installed \n $dimm_id"
+      ((num_dimms_installed++))
+    else
+      dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
+    fi
+    ((max_dimms++))
+  fi
+done
+echo "Installed DIMM number  : $num_dimms_installed"
+num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
+if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
+  echo "Error: The installed DIMMs number does not match the mapped memory device: $num_dimms_mapped"
+fi
+num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
+if [ $num_dimms_installed -ne $num_clock_configed ]; then
+  echo "Error: The installed DIMMs number does not match configured clocks: $num_clock_configed"
+fi
+echo -e "Installed DIMMs Locator: $dimms_installed"
+echo -e "Not installed DIMMs    : $dimms_uninstalled"
+max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
+echo "DIMMs max slots        : $max_dimm_slots"
+if [ $max_dimms -ne $max_dimm_slots ]; then
+  echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
+fi
+free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
+free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
+if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
+  mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+  mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB"
+  swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
+  total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
+else
+  mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+fi
+echo "Memory Size            : $mem_sz"
+echo "Swap Memory Size       : $swap_sz"
+echo "Total Memory Size      : $total_sz"
+echo "Max Memory Capacity    : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
+# DIMMs frequency
+clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
+echo "Configured Clock Speed : $clock_speeds"
+num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
+if [ $num_clock_type -ne 1 ]; then
+  echo "Warning: Have more than 1 speed type, all DIMMs should have the same frequency: $clock_speeds"
+fi
+
+echo "-------------------------- Turbo Information --------------------------"
+scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
+echo "Scaling Driver         : $scaling_drive"
+if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
+  turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
+  if [ $turbo -eq 1 ]; then
+    echo "Turbo Status           : OFF"
+  else
+    echo "Turbo Status           : ON"
+  fi
+else
+  echo "Warning: Scaling driver is not intel_pstate; you may need to enable it in the BIOS"
+  echo "Turbo Status           : Unknown"
+fi
+# cpu frequency
+num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
+num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
+if [ $num_max_freq -ne 1 ]; then
+  echo "Error: the max_frequency of all CPUs should be equal"
+fi
+if [ $num_min_freq -ne 1 ]; then
+  echo "Error: the min_frequency of all CPUs should be equal"
+fi
+max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
+max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
+min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
+min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
+echo "CPU Max Frequency      : $max_freq GHz"
+echo "CPU Min Frequency      : $min_freq GHz"
+# cpu governor
+num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
+if [ $num_governor -ne 1 ]; then
+  echo "Error: the governor of all CPUs should be the same"
+fi
+governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
+echo "CPU Freq Governor      : $governor"
+
+
+echo "========================= Software Information ========================="
+echo "BIOS Release Date      : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
+echo "OS Version             : `cat /etc/redhat-release`"
+echo "Kernel Release Version : `uname -r`"
+echo "Kernel Patch Version   : `uname -v`"
+echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
+if command -v cmake >/dev/null 2>&1; then
+  cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
+else
+  cmake_ver=" Not installed"
installed" +fi +echo "CMake Version :$cmake_ver" +echo "------------------ Environment Variables Information -------------------" +kmp_affinity=`env | grep KMP_AFFINITY` +omp_dynamic=`env | grep OMP_DYNAMIC` +omp_nested=`env | grep OMP_NESTED` +omp_num_threads=`env | grep OMP_NUM_THREADS` +mkl_num_threads=`env | grep MKL_NUM_THREADS` +mkl_dynamic=`env | grep MKL_DYNAMIC` +if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi +if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi +if [ ! $omp_nested ]; then omp_nested="unset"; fi +if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi +if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi +if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi +echo "KMP_AFFINITY : $kmp_affinity" +echo "OMP_DYNAMIC : $omp_dynamic" +echo "OMP_NESTED : $omp_nested" +echo "OMP_NUM_THREADS : $omp_num_threads" +echo "MKL_NUM_THREADS : $mkl_num_threads" +echo "MKL_DYNAMIC : $mkl_dynamic" +# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH +for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do + mkldnn_found=`find $path -name "libmkldnn.so"` + if [ "$mkldnn_found" ]; then + echo "Found MKL-DNN : $mkldnn_found" + fi + mklml_found=`find $path -name "libmklml_intel.so"` + if [ "$mklml_found" ]; then + echo "Found MKLML : $mklml_found" + fi + iomp_found=`find $path -name "libiomp5.so"` + if [ "$iomp_found" ]; then + echo "Found IOMP : $iomp_found" + fi +done + +# dump all details for fully check +lscpu > lscpu.dump +dmidecode > dmidecode.dump + +# The expected result would be like: +# ========================= Hardware Information ========================= +# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz +# CPU Family : 6 +# Socket Number : 2 +# Cores Per Socket : 20 +# Total Physical Cores : 40 +# Total Virtual Cores : 40 +# Hyper Threading : OFF +# NUMA Nodes : 2 +# -------------------------- Memory Information -------------------------- +# Installed DIMM number : 12 +# Installed DIMMs Locator: +# CPU1_DIMM_A1 +# CPU1_DIMM_B1 +# CPU1_DIMM_C1 +# CPU1_DIMM_D1 +# CPU1_DIMM_E1 +# CPU1_DIMM_F1 +# CPU2_DIMM_A1 +# CPU2_DIMM_B1 +# CPU2_DIMM_C1 +# CPU2_DIMM_D1 +# CPU2_DIMM_E1 +# CPU2_DIMM_F1 +# Not installed DIMMs : +# CPU1_DIMM_A2 +# CPU1_DIMM_B2 +# CPU1_DIMM_C2 +# CPU1_DIMM_D2 +# CPU1_DIMM_E2 +# CPU1_DIMM_F2 +# CPU2_DIMM_A2 +# CPU2_DIMM_B2 +# CPU2_DIMM_C2 +# CPU2_DIMM_D2 +# CPU2_DIMM_E2 +# CPU2_DIMM_F2 +# DIMMs max slots : 24 +# Memory Size : 376G +# Swap Memory Size : 4.0G +# Total Memory Size : 380G +# Max Memory Capacity : 2304 GB +# Configed Clock Speed : 2666 MHz +# -------------------------- Turbo Information -------------------------- +# Scaling Driver : intel_pstate +# Turbo Status : ON +# CPU Max Frequency : 3.70 GHz +# CPU Min Frequency : 1.00 GHz +# CPU Freq Governor : performance +# ========================= Software Information ========================= +# BIOS Release Date : 03/10/2017 +# OS Version : CentOS Linux release 7.3.1611 (Core) +# Kernel Release Version : 3.10.0-514.el7.x86_64 +# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016 +# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11) +# CMake Version : 3.5.2 +# ------------------ Environment Variables Information ------------------- +# KMP_AFFINITY : unset +# OMP_DYNAMIC : unset +# OMP_NESTED : unset +# OMP_NUM_THREADS : unset +# MKL_NUM_THREADS : unset +# MKL_DYNAMIC : unset diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py index 
9b03ed1d8f6a28259a6cb45f096575b5f3d27ca7..e44bb4505b924b3a955bb5e740aa498f253d7556 100644 --- a/paddle/scripts/cluster_train/paddle.py +++ b/paddle/scripts/cluster_train/paddle.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/python # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile index 1a2d19e823541750830fcaa25f65b2f8e1ea2b49..c2f631bdf4ed52a5dfa3fbcf1157d0abbdeadb9b 100644 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile +++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile @@ -1,7 +1,7 @@ # Build this image: docker build -t mpi . # -FROM paddledev/paddle:0.10.0rc3 +FROM paddlepaddle/paddle:0.10.0rc3 ENV DEBIAN_FRONTEND noninteractive diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py index dff4339ea33b72e22104a56183e3302067dc583d..d0cbb070c432f375430de261fd0eb3233fe26df7 100644 --- a/paddle/scripts/cpplint.py +++ b/paddle/scripts/cpplint.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index f3a6f1dba7588c6b29c1dcae26ec134c1a7f937d..f0620498cfa6775ce2949cc02fa9f6c9529dec2e 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -20,7 +20,7 @@ binaries. ## Run The Build -### Build Evironments +### Build Environments The pre-built build environment images are: @@ -192,7 +192,7 @@ For developers who are interested in the C++ source code, please use -e "WOBOQ=O - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: ```bash -docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev +docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev ``` - You can open the generated HTML files in your Web browser. 
Or, if you want to run an Nginx container to serve them for a wider audience, you can run:
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index fda2a2f1b764106a7a108e8c56bc90ce3459e9b5..e70d04d9017e9e36bbd55d6a28889d9ba7fb2a13 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -16,11 +16,13 @@ function cmake_gen() {
     echo "using python abi: $1"
     if [ "$1" == "cp27-cp27m" ]; then
       export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+      export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
       PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
     -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
     -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
     elif [ "$1" == "cp27-cp27mu" ]; then
       export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+      export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
       PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
     -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
     -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
@@ -34,6 +36,7 @@ function cmake_gen() {
         ${PYTHON_FLAGS}
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
         -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-ON}
@@ -55,6 +58,7 @@ EOF
         ${PYTHON_FLAGS} \
         -DWITH_DOC=OFF \
         -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
        -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
         -DWITH_GOLANG=${WITH_GOLANG:-ON} \
@@ -109,7 +113,10 @@ EOF
         -DWITH_SWIG_PY=ON \
         -DWITH_STYLE_CHECK=OFF
     make -j `nproc` gen_proto_py
+    make -j `nproc` paddle_python
     make -j `nproc` paddle_docs paddle_docs_cn
+    make -j `nproc` print_operators_doc
+    paddle/pybind/print_operators_doc > doc/en/html/operators.json
     popd
   fi
@@ -171,7 +178,7 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip && pip install -U pip && \
+        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
@@ -186,6 +193,16 @@ EOF
 EOF
 }
 
+function gen_capi_package() {
+  if [[ ${WITH_C_API} == "ON" ]]; then
+    install_prefix="/paddle/build/capi_output"
+    rm -rf $install_prefix
+    make DESTDIR="$install_prefix" install
+    cd $install_prefix/usr/local
+    ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
+  fi
+}
+
 set -xe
 
 cmake_gen ${PYTHON_ABI:-""}
@@ -193,6 +210,11 @@ run_build
 run_test
 gen_docs
 gen_dockerfile
-
-printf "If you need to install PaddlePaddle in develop docker image,"
-printf "please make install or pip install build/python/dist/*.whl.\n"
+gen_capi_package
+
+if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
+  printf "PaddlePaddle C-API libraries were generated at build/paddle.tgz\n"
+else
+  printf "If you need to install PaddlePaddle in develop docker image,"
+  printf "please make install or pip install build/python/dist/*.whl.\n"
+fi
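cmake_gen above keys the interpreter paths off the requested wheel ABI: cp27-cp27m is a narrow-unicode (UCS-2) CPython build and cp27-cp27mu a wide-unicode (UCS-4) one. A small sketch for checking which ABI a given Python 2 interpreter matches:

```python
import sys

# Narrow (UCS-2) builds cap sys.maxunicode at 0xFFFF; wide (UCS-4)
# builds report 0x10FFFF. This mirrors the cp27m/cp27mu split above.
abi = 'cp27-cp27mu' if sys.maxunicode > 0xFFFF else 'cp27-cp27m'
print('PYTHON_ABI should be %s for this interpreter' % abi)
```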
"@WITH_MKL@" == "OFF" ]; then return 0 fi - ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + platform="`uname -s`" + ht=0 + if [ $platform == "Linux" ]; then + ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + elif [ $platform == "Darwin" ]; then + if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then + # HT is OFF + ht=1 + fi + else + return 0 + fi if [ $ht -eq 1 ]; then # HT is OFF if [ -z "$KMP_AFFINITY" ]; then export KMP_AFFINITY="granularity=fine,compact,0,0" @@ -71,10 +82,16 @@ function threads_config() { # auto set OMP_NUM_THREADS and MKL_NUM_THREADS # according to trainer_count and total processors # only when MKL enabled - if [ "@WITH_MKL@" == "OFF" ]; then + # auto set OPENBLAS_NUM_THREADS when do not use MKL + platform="`uname -s`" + processors=0 + if [ $platform == "Linux" ]; then + processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` + elif [ $platform == "Darwin" ]; then + processors=`sysctl -n hw.logicalcpu` + else return 0 fi - processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` if [ -z $trainers ]; then trainers=1 @@ -83,12 +100,22 @@ function threads_config() { if [ $threads -eq 0 ]; then threads=1 fi - if [ -z "$OMP_NUM_THREADS" ]; then - export OMP_NUM_THREADS=$threads - fi - if [ -z "$MKL_NUM_THREADS" ]; then - export MKL_NUM_THREADS=$threads + if [ "@WITH_MKL@" == "ON" ]; then + if [ -z "$OMP_NUM_THREADS" ]; then + export OMP_NUM_THREADS=$threads + fi + if [ -z "$MKL_NUM_THREADS" ]; then + export MKL_NUM_THREADS=$threads + fi + else + if [ -z "$OPENBLAS_NUM_THREADS" ]; then + export OPENBLAS_NUM_THREADS=$threads + fi + if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then + export OPENBLAS_MAIN_FREE=1 + fi fi + } PADDLE_CONF_HOME="$HOME/.config/paddle" @@ -146,7 +173,7 @@ cpu_config case "$1" in "train") threads_config $@ - # echo $OMP_NUM_THREADS $MKL_NUM_THREADS + # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ;; "merge_model") @@ -161,9 +188,6 @@ case "$1" in "make_diagram") python -m paddle.utils.make_model_diagram ${@:2} ;; - "usage") - $PADDLE_BIN_PATH/paddle_usage ${@:2} - ;; "version") version ;; diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh index c6cbbc4eef94fb2e2fc3c1ce71734fbb23fc22d7..f9bc8bf63ae9afdfca1ff660bc83e62e71f03005 100755 --- a/paddle/scripts/tools/build_docs/build_docs.sh +++ b/paddle/scripts/tools/build_docs/build_docs.sh @@ -5,4 +5,4 @@ docker run --rm \ -e "WITH_AVX=ON" \ -e "WITH_DOC=ON" \ -e "WOBOQ=ON" \ - ${1:-"paddledev/paddle:dev"} + ${1:-"paddlepaddle/paddle:latest-dev"} diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh deleted file mode 100755 index 7dbd1f58842f50ea1df0e2476c4a493569b1dda9..0000000000000000000000000000000000000000 --- a/paddle/scripts/tools/usage_stat/usage.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash - -ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code: -- "$@"` -KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US" -# paddle config home dir, same as paddle -PADDLE_CONF_HOME="$HOME/.config/paddle" -# api url, mirror url(s) will be append later -PD_URLS="http://api.paddlepaddle.org/version" - -usage() -{ - echo "Usage: `basename $0` [options]" - echo "Options:" - echo " -e, --exit-code=EXIT_CODE The train/predict process's exit code" - echo " -l, --log-file=LOG_FILE_PATH Read which log file 
to get the duration of process" - echo " -n, --task-name=TASK_NAME The name of demo or example" - echo " -u, --git-user=GITHUB_USER provide contact info, like username or email" - echo " -v, -i Verbose output and interact with user when necessary" - echo " --help display this help message" -} - -eval set -- "${ARGPARSE}" -while true; do - case "$1" in - -l|--log-file) - log_file=$2 - shift 2 - ;; - -e|--exit-code) - exit_code=$2 - shift 2 - ;; - -u|--git-user) - github_user=$2 - shift 2 - ;; - -n|--task-name) - task=$2 - shift 2 - ;; - -v|-i) - v=1 - shift - ;; - --dry-run) - dry_run=1 - shift - ;; - --) - shift - break - ;; - --help) - usage - exit 0 - ;; - *) - echo "Invalid option $1" - usage - exit 1 - ;; - esac -done - -# parse the log_file to get the time costs -if [ -s "${log_file}" ]; then - duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;} - {if(index($2,":")==3){ - t=substr($2,1,8); - sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2); - if(secsec){min_sec=sec;} - if(max_sec==0 || max_sec/dev/null` - git_url=`git config --get remote.origin.url 2>/dev/null` - if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then - # under a git url, like https://github.com/user_xxx/proj_yyy.git - if [ "${v}" = "1" ]; then echo " from github url..."; fi - github_user=`echo ${git_url} | cut -d "/" -f 4` - if [ "${github_user}" = "PaddlePaddle" ]; then - github_user= - fi - fi - if [ -n "${git_username}" -a -z "${github_user}" ]; then - if [ "${v}" = "1" ]; then echo " from global git username..."; fi - github_user=${git_username} - fi - fi -fi -# allow user to set the user name, if it's not found -if [ -z "${github_user}" -a "${v}" = "1" ]; then - read -p "Please input your github username or email, or just return to keep this feedback anonymous:" - github_user=${REPLY} - if [ -z "${github_user}" ]; then - # empty input, consider as one anonymous user - github_user="${KEEP_ANONYMOUS}" - fi -fi -if [ -n "${github_user}" -a -z "${dry_run}" ]; then - # valid user and not in dry-run mode, then save to cache - mkdir -p ${PADDLE_CONF_HOME} - echo "${github_user}" >${PADDLE_CONF_HOME}/github_user -fi -if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi -if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then - # anonymous user should keep the var empty. - github_user= -fi - -# read local paddle version -paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1` -if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi - -# read local system time -system_time=`date "+%Y%m%d%H%M%S"` -if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi - -# make empty job_name as default value. -if [ -z "${task}" ]; then - task="(unknown_task)" -fi -if [ "${v}" = "1" ]; then echo "task: ${task}"; fi - -# concat the curl command -params="content={\"data_type\":\"usage\",\ -\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\ -\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\ -\"duration\":${duration},\"exit_code\":\"${exit_code}\"\ -}&type=1" -curl_cmd_prefix="curl -m 5 -X POST -d ${params}\ - -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie " - -if [ "${dry_run}" = "1" ]; then - first_url=`echo ${PD_URLS} | cut -d " " -f 1` - echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}" - exit 0 -else - for u in ${PD_URLS}; do - curl_cmd="${curl_cmd_prefix} ${u}" - if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi - ${curl_cmd} >/dev/null 2>&1 - if [ $? 
-eq 0 ]; then - if [ "${v}" = "1" ]; then echo "upload OK!"; fi - exit 0 - else - if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi - fi - done - if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi - exit 1 -fi diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 28d82343ed32273740d0c52d0451681e43b3675e..0db8d33bbcb5278ed0dd5584b5822502b719ede9 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -8,7 +8,10 @@ cd $TRAVIS_BUILD_DIR/build # Compile Documentation only. cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py +make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn +make -j `nproc` print_operators_doc +paddle/pybind/print_operators_doc > doc/en/html/operators.json # check websites for broken links linkchecker doc/en/html/index.html diff --git a/paddle/string/piece.cc b/paddle/string/piece.cc index b80afdec82d642fd3a8245b96ce1bb2bea17cbae..2a553e2832014f4de00c5613d9174edac69167a6 100644 --- a/paddle/string/piece.cc +++ b/paddle/string/piece.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. diff --git a/paddle/string/piece.h b/paddle/string/piece.h index 7362ce02c7c80e121218fab77d87696403b1c5e8..fc95263379280221256dd0f696d2cb9999721d86 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. diff --git a/paddle/string/piece_test.cc b/paddle/string/piece_test.cc index cf5152ff5a3cb0a2afae0c90b787abf291122fa3..fb8b9729880a01ee2e597a575d7da15313a471d9 100644 --- a/paddle/string/piece_test.cc +++ b/paddle/string/piece_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. diff --git a/paddle/string/printf.h b/paddle/string/printf.h index 8b5ce63a8e8dfe77962ff1e7415911d381a28aac..70d2511531698b03807d751dcec907ba857354f2 100644 --- a/paddle/string/printf.h +++ b/paddle/string/printf.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. diff --git a/paddle/string/printf_test.cc b/paddle/string/printf_test.cc index 2586264046a2e2ba24b0908c1f6eba163cdef448..b5ad35513bdcbccd5dbf951a3a8e5422f6424b26 100644 --- a/paddle/string/printf_test.cc +++ b/paddle/string/printf_test.cc @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/string/printf.h" #include diff --git a/paddle/string/tinyformat/tinyformat.h b/paddle/string/tinyformat/tinyformat.h index 3516777d9f9669c1e1300b9136c26e61f65b14a7..092c04c3153dfcec4c7c577ea1ac4e8a4a1359a0 100644 --- a/paddle/string/tinyformat/tinyformat.h +++ b/paddle/string/tinyformat/tinyformat.h @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// tinyformat.h // Copyright (C) 2011, Chris Foster [chris42f (at) gmail (d0t) com] // diff --git a/paddle/string/to_string.h b/paddle/string/to_string.h index 4f478b6a36b23bdba8ef3ddae94b3eadf18716c2..3b3bcc69a478045156225728236174fd601461dd 100644 --- a/paddle/string/to_string.h +++ b/paddle/string/to_string.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once #include diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc index 971484dd0c073762e99f3926576eb21b96197769..4956bd96fad5fd1decaad0a367135cb7d7ecaf6e 100644 --- a/paddle/string/to_string_test.cc +++ b/paddle/string/to_string_test.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
 
 #include "paddle/string/to_string.h"
 #include <gtest/gtest.h>
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 4245df5ab72bf0fd67261818b307f0babdb5d685..77f84cd43bdf35ae6f54b0db2b5f720d24872878 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -5,4 +5,7 @@ if(WITH_TESTING)
   add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  if(NOT MOBILE_INFERENCE)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
+  endif()
 endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a7fb50ee4149a3c36077f83383f45f3106e7e0f1
--- /dev/null
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstring>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/init.h"
+#include "paddle/memory/memory.h"
+
+int main(int argc, char** argv) {
+  std::vector<char*> new_argv;
+  std::string gflags_env;
+  new_argv.push_back(argv[0]);
+#ifdef PADDLE_WITH_CUDA
+  new_argv.push_back(
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+#else
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+#endif
+  int new_argc = static_cast<int>(new_argv.size());
+  char** new_argv_address = new_argv.data();
+  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
+  testing::InitGoogleTest(&argc, argv);
+  paddle::memory::Used(paddle::platform::CPUPlace());
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::memory::Used(paddle::platform::CUDAPlace(0));
+#endif
+
+  paddle::framework::InitDevices();
+  return RUN_ALL_TESTS();
+}
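The `--tryfromenv` flag injected above makes gflags fall back to `FLAGS_<name>` environment variables, so CI can tune memory behaviour without editing test command lines. A hedged sketch of driving a test binary that links `paddle_gtest_main` (the binary path here is hypothetical):

```python
import os
import subprocess

# gflags --tryfromenv reads each listed flag from a FLAGS_<name> env var.
env = dict(os.environ,
           FLAGS_fraction_of_gpu_memory_to_use='0.5',
           FLAGS_use_pinned_memory='true')
subprocess.check_call(['./build/paddle/framework/some_test'], env=env)
```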
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index 3d471a0c01ca17cb98272159baf6d489c18824d5..72911695bd4959d73d783897b0c5e674454c30bc 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -54,7 +54,7 @@ if(WITH_TESTING)
   add_subdirectory(tests)
 endif()
 
-if(NOT WITH_C_API)
+if(NOT MOBILE_INFERENCE)
   add_paddle_exe(paddle_trainer TrainerMain.cpp)
   add_paddle_exe(paddle_merge_model MergeModel.cpp)
@@ -74,7 +74,5 @@ endif()
 if(WITH_GOLANG)
   add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
   target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
-  if(NOT WITH_C_API)
-    target_link_libraries(paddle_trainer paddle_pserver_cclient)
-  endif()
+  target_link_libraries(paddle_trainer paddle_pserver_cclient)
 endif(WITH_GOLANG)
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 88e684849df6fbfe4042b92bdb76ef98159eecea..3e4a2b5fa8a3981f6362edc1dc61ae1616e257ef 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -138,7 +138,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
   }
 
   if (FLAGS_use_mkldnn) {
-    CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only needs 1 trainer";
+    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only needs 1 trainer";
   }
 
   if (testing) {
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index a0a365aa0bb0ac26939a02c1cd626d0c17c6a9fe..2b68d89e48a3efd5de205ce33643b7e6320a4303 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -29,6 +29,7 @@ DECLARE_bool(with_gpu);
 DECLARE_bool(parallel_nn);
 DECLARE_string(config_args);
 DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkl_packed);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
@@ -46,6 +47,7 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
             << ",parallel_nn=" << FLAGS_parallel_nn
             << ",use_mkldnn=" << FLAGS_use_mkldnn
+            << ",use_mkl_packed=" << FLAGS_use_mkl_packed
             << ",cudnn_version=" << hl_get_cudnn_lib_version();
   if (!FLAGS_config_args.empty()) {
     configArgs << "," << FLAGS_config_args;
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 2739878b7f2936ea2da689da0b4caa780516ccc1..bd518d8598f5aa7c32298ed2110a96a2743536b3 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -1,19 +1,17 @@
-################# test_Compare ############################
-add_unittest_without_exec(test_Compare
-    test_Compare.cpp)
-add_test(NAME test_Compare
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+set(PYTHON_PATH
+    ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+    ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
+function(trainer_test TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
 
-################# test_Trainer ###########################
-add_unittest_without_exec(test_Trainer
-    test_Trainer.cpp)
-add_test(NAME test_Trainer
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+trainer_test(test_Compare)
+trainer_test(test_PyDataProviderWrapper)
+trainer_test(test_recurrent_machine_generation)
+trainer_test(test_Trainer)
 
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
@@ -22,32 +20,13 @@ if(WITH_PYTHON)
   add_unittest_without_exec(test_TrainerOnePass
       test_TrainerOnePass.cpp)
   add_test(NAME test_TrainerOnePass
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
-        ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
+        ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 
-################# test_recurrent_machine_generation ###############
-add_unittest_without_exec(test_recurrent_machine_generation
-    test_recurrent_machine_generation.cpp)
-add_test(NAME test_recurrent_machine_generation
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) - -#################### test_PyDataProviderWrapper ######################### -add_unittest_without_exec(test_PyDataProviderWrapper - test_PyDataProviderWrapper.cpp) - -add_test(NAME test_PyDataProviderWrapper - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) - #################### test_config_parser ######################### add_test(NAME test_config_parser - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py + COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} + ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py index 30346ef299d0bc8585ccff7f2fc4885b0d9f9dfc..ba554d5872d5a6e94526eb4d09f62a4f2f21ff30 100644 --- a/paddle/trainer/tests/simple_sparse_neural_network.py +++ b/paddle/trainer/tests/simple_sparse_neural_network.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4) diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/trainer/tests/simple_sparse_neural_network_dp.py index 86b272edfe1bbb23c45cffe282f6475ceaa0cc41..44e96873f0d38b4349f893b9f3daf53f893869cc 100644 --- a/paddle/trainer/tests/simple_sparse_neural_network_dp.py +++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer.PyDataProvider2 import provider, integer_sequence, integer_value import random diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 8f100f02e90bcbc7fdcf6f053aec6f95cfb09c1a..ea47cf23eb6e56082eeb92f3c6dff8d03be0d679 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -20,13 +20,20 @@ DEFINE_bool(use_gpu, false, "Only support CPU training"); DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // TODO(TJ): change to true when MKLDNN layers support multi-inputs DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); #else DEFINE_bool(use_mkldnn, false, "Only support CPU training"); #endif +#ifdef PADDLE_WITH_MKLML +// TODO(TJ): change to true when fully confirmed +DEFINE_bool(use_mkl_packed, false, "Whether to use MKL Packed Optimization"); +#else +DEFINE_bool(use_mkl_packed, false, "Not to use MKL Packed Optimization"); +#endif + DEFINE_bool(parallel_nn, false, "Whether to use multi-threads to calculate one neural network." diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index 1832bb515ec85df3d7733e01b063a01ad6a3b282..b64295bca09a199f24605a158d1d9db7e7d91660 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -41,3 +41,4 @@ DECLARE_string(predict_file); DECLARE_bool(prev_batch_state); DECLARE_string(init_model_path); DECLARE_bool(use_mkldnn); +DECLARE_bool(use_mkl_packed); diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py index ccfaa7c147b2ce25cb6007aa04cfc33961b7e10b..29f8deb32455a49baad0ab5a2b5fab0a62917ec3 100644 --- a/paddle/utils/enable_virtualenv.py +++ b/paddle/utils/enable_virtualenv.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import os diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 2c2cc6245932d4af56a68d6399ce31f008bf3748..1fbdd5bbd82a0ae15f620fa11a14ac3126e52838 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -139,6 +139,8 @@ message PoolConfig { optional uint32 output_z = 16 [ default = 1 ]; optional uint32 img_size_z = 17 [ default = 1 ]; optional uint32 padding_z = 18 [ default = 1 ]; + + optional bool exclude_mode = 19; } message SppConfig { @@ -540,6 +542,13 @@ message LayerConfig { // for switch order layer optional ReshapeConfig reshape_conf = 59; + + // for batch normalization layer + // The small constant added to the variance to improve numeric stability. + optional double epsilon = 60 [ default = 0.00001 ]; + + // for factorization machine layer + optional uint32 factor_size = 61; } message EvaluatorConfig { diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index d27b1bcf80045216a5807812d39f7a248a956076..b341d78d194ddcbab265084db62752bc53e1b709 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -1,3 +1,16 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. syntax = "proto2"; option optimize_for = LITE_RUNTIME; diff --git a/python/.gitignore b/python/.gitignore index cc7d0ece4acaba2a3fa38a89110587fe8dffb992..1ba1d4c9b0301ed920f5303089e75dd3a8e4e3fa 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -2,6 +2,7 @@ build dist paddle.egg-info +paddlepaddle_gpu.egg-info .idea paddle/proto/*.py paddle/proto/*.pyc diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c8632295a25b160513a8e154bf1a5453c0005031..36919ab00bf6fc1f9aee350074e5532bfdc7d45e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -29,8 +29,14 @@ if(WITH_MKLML) endif() if(WITH_MKLDNN) - list(APPEND MKL_SHARED_LIBS "${MKLDNN_LIB}" "${MKLDNN_LIB}.0") - list(APPEND MKL_DEPENDS mkldnn) + list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}") + list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib) +endif() + +if(WITH_GPU) + SET(PACKAGE_NAME "paddlepaddle-gpu") +else() + SET(PACKAGE_NAME "paddlepaddle") endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f662d6826321eb840739382558f76327d27b5847..1030c94e16376c326cb8b32926b8c47625cd38f0 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -11,3 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+try:
+    from version import full_version as __version__
+    from version import commit as __git_commit__
+except ImportError:
+    import sys
+    sys.stderr.write('''Warning with import paddle: you should not
+     import paddle from the source directory; please install paddlepaddle*.whl first.'''
+                     )
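Once a wheel built from this tree is installed, the generated `python/paddle/version.py` (git-ignored above) backs the two attributes imported here. A quick check, with illustrative output values:

```python
import paddle

print(paddle.__version__)     # e.g. '0.11.0' -- value is illustrative
print(paddle.__git_commit__)  # hash of the HEAD the wheel was built from
```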
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 0f97c279a0d4d22cb434d63dd39bf16cfb7174ed..4fdf4090212e31adcccf6b119c937e70d5cbf995 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1233,7 +1233,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
         'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
@@ -1262,6 +1262,8 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
         pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                              pool_conf.padding_y,
                                              pool_conf.stride_y, not ceil_mode)
+    if exclude_mode != None:
+        pool_conf.exclude_mode = exclude_mode
 
 
 def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
@@ -2037,13 +2039,20 @@ class ParameterReluLayer(LayerBase):
     def __init__(self, name, inputs, partial_sum=1, **args):
         super(ParameterReluLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **args)
+
         input_layer = self.get_input_layer(0)
         config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
         config_assert(input_layer.size % partial_sum == 0,
                       "a wrong setting for partial_sum")
+
+        dims = [1, input_layer.size / partial_sum]
         self.set_layer_size(input_layer.size)
         self.config.partial_sum = partial_sum
-        self.create_input_parameter(0, input_layer.size / partial_sum)
+        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
+
+        self.set_layer_height_width(self.get_input_layer(0).height, \
+                self.get_input_layer(0).width)
+        self.set_layer_depth(self.get_input_layer(0).depth)
 
 
 @config_layer('conv')
@@ -2280,11 +2289,17 @@ class Conv3DLayer(Conv3DLayerBase):
 class NormLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
         super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn = True if use_mkldnn and self.inputs[
+            0].norm.norm_type == 'cmrnorm-projection' else False
+        self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
             parse_norm(self.inputs[input_index].norm, input_layer.name,
                        norm_conf)
+            norm_conf.scale = self.inputs[
+                input_index].norm.scale if use_mkldnn else norm_conf.scale
             self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                                norm_conf.channels, False)
             if norm_conf.norm_type == "cross-channel-norm":
@@ -2296,7 +2311,8 @@ class NormLayer(LayerBase):
 class PoolLayer(LayerBase):
     layer_type = 'pool'
 
-    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+    def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None,
+                 **xargs):
         use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
         if self.layer_type == "mkldnn_pool":
             config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
@@ -2307,7 +2323,7 @@ class PoolLayer(LayerBase):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf, ceil_mode)
+                       pool_conf, ceil_mode, exclude_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
@@ -2393,6 +2409,14 @@ class CropLayer(LayerBase):
             image_conf.img_size_y = input_layer.height
             image_conf.channels = input_layer.size / (input_layer.width *
                                                       input_layer.height)
+        # only supports 4-D inputs in NCHW order
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
 
 
 @config_layer('batch_norm')
@@ -2405,6 +2429,7 @@ class BatchNormLayer(LayerBase):
                  bias=True,
                  img3D=False,
                  use_global_stats=True,
+                 epsilon=1e-5,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
                  mean_var_names=None,
@@ -2453,6 +2478,9 @@ class BatchNormLayer(LayerBase):
             self.config.use_global_stats = use_global_stats
         if moving_average_fraction is not None:
             self.config.moving_average_fraction = moving_average_fraction
+        if epsilon is not None:
+            assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
+            self.config.epsilon = epsilon
 
         input_layer = self.get_input_layer(0)
         image_conf = self.config.inputs[0].image_conf
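For the `epsilon` knob added to `BatchNormLayer` above (backed by the new `epsilon` field in ModelConfig.proto earlier), a hedged config sketch; it assumes the `batch_norm_layer` wrapper in trainer_config_helpers forwards the keyword down to this class, which is not shown in this diff:

```python
from paddle.trainer_config_helpers import *

data = data_layer(name='image', size=2304)
fc = fc_layer(input=data, size=128, act=LinearActivation())
# Values below 1e-5 now trip the assert in BatchNormLayer.__init__.
bn = batch_norm_layer(input=fc, epsilon=1e-4)
```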
@@ -2787,19 +2815,18 @@ class AddToLayer(LayerBase):
             name, self.layer_type, 0, inputs=inputs, **xargs)
         config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
 
-        if len(self.inputs) > 1:
-            for input_index in xrange(len(self.inputs)):
-                assert self.get_input_layer(0).height == self.get_input_layer(
-                    input_index).height
-                assert self.get_input_layer(0).width == self.get_input_layer(
-                    input_index).width
-                assert self.get_input_layer(0).depth == self.get_input_layer(
-                    input_index).depth
+        layer_size = self.get_input_layer(0).size
+        # To reserve height, width, depth.
+        layer_with_hwc = self.get_input_layer(0)
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            assert layer_size == input_layer.size
+            if input_layer.height and input_layer.width and input_layer.depth:
+                layer_with_hwc = input_layer
 
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                    self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
+        self.set_layer_size(layer_with_hwc.size)
+        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
+        self.set_layer_depth(layer_with_hwc.depth)
         self.create_bias_parameter(bias, self.config.size)
 
 
@@ -3595,8 +3622,13 @@ class ConcatenateLayer2(LayerBase):
 
 @config_layer('recurrent')
 class RecurrentLayer(LayerBase):
+    layer_type = 'recurrent'
+
     def __init__(self, name, inputs, reversed=False, bias=True, **xargs):
-        super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs,
+        use_mkl_packed = bool(
+            int(g_command_config_args.get("use_mkl_packed", 0)))
+        self.layer_type = 'mkl_packed_recurrent' if use_mkl_packed else 'recurrent'
+        super(RecurrentLayer, self).__init__(name, self.layer_type, 0, inputs,
                                              **xargs)
         config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input')
         input_layer = self.get_input_layer(0)
@@ -3839,6 +3871,26 @@ class SwitchOrderLayer(LayerBase):
             name, 'switch_order', 0, inputs=inputs, **xargs)
         self.config.reshape_conf.height_axis.extend(reshape['height'])
         self.config.reshape_conf.width_axis.extend(reshape['width'])
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            in_h = input_layer.height
+            in_w = input_layer.width
+            out_dims = None
+            if input_layer.has_depth():
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
+                out_dims = [0, in_d, in_h, in_w, in_c]
+            else:
+                in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
+                out_dims = [0, in_h, in_w, in_c]
+            # Because (reshape['width'][0] > 0) is always true,
+            # out_dims[0] won't be used.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
 
 
 @config_layer('scale_sub_region')
@@ -3860,6 +3912,21 @@ class ScaleSubRegionLayer(LayerBase):
                            image_conf.channels)
 
 
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        input_layer = self.get_input_layer(0)
+        psize = input_layer.size * factor_size
+        dims = [input_layer.size, factor_size]
+        self.create_input_parameter(0, psize, dims)
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):
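A usage sketch for the new `factorization_machine` config (exported from layers.py further below). `factor_size` is the latent-vector width, and the single learned parameter has shape `[input_size, factor_size]`, matching `psize`/`dims` computed above; the wrapper's exact signature is assumed from this config class:

```python
from paddle.trainer_config_helpers import *

features = data_layer(name='feature', size=1024)
# Learns pairwise feature interactions with a rank-10 factorization,
# producing a single scalar output per sample (size=1 above).
fm = factorization_machine(input=features, factor_size=10)
```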
As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs + Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs during training process of a layer with parameters, such as img_conv layer, fc layer. - :param type: Hook type, currently supported types: + :param type: Hook type, currently supported types: 'pruning' : user specify a sparsity_ratio before training started, and the - network will prune the parameters based on the sparsity_ratio. + network will prune the parameters based on the sparsity_ratio. eg: The definition of Hook object can be hk = HookAttribute('pruning', 0.6) The specific usage can be paddle.layer.img_conv(input=img, filter_size=3, num_channels=3, num_filters=64, @@ -71,10 +71,10 @@ class HookAttribute(object): The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf :type type: string - :param sparsity_ratio: Must be specified if hook type is 'pruning', + :param sparsity_ratio: Must be specified if hook type is 'pruning', it represents the ratio of the zero elements to be set by the Parameter. :type sparsity_ratio: float or None - + """ def __init__(self, type, sparsity_ratio=None): @@ -130,10 +130,12 @@ class ParameterAttribute(object): :param sparse_update: Enable sparse update for this parameter. It will enable both local and remote sparse update. :type sparse_update: bool + :param update_hooks: A HookAttribute object. + :type update_hooks: HookAttribute :param initializer: If not None, it should be a callable object which accepts a parameter name and returns numpy array for the initial value of the parameter - :param initializer: callable object + :type initializer: callable object """ def __init__(self, diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 57979db4de08989ab583b0ab41589c09789a0921..95797fba8f67bacb421f5c2813ad6332bc53cbc9 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -297,7 +297,7 @@ def auc_evaluator( def pnpair_evaluator( input, label, - info, + query_id, weight=None, name=None, ): """ @@ -308,16 +308,20 @@ def pnpair_evaluator( .. code-block:: python - eval = pnpair_evaluator(input, label, info) + eval = pnpair_evaluator(input, label, query_id) :param input: Input Layer name. The output prediction of network. :type input: LayerOutput :param label: Label layer name. :type label: LayerOutput - :param info: Info layer name. (TODO, explaination) - :type info: LayerOutput + :param query_id: Query_id layer name. Query_id indicates which query + each sample belongs to. Its shape should be + the same as the output of the Label layer. + :type query_id: LayerOutput :param weight: Weight Layer name. It should be a matrix with size - [sample_num, 1]. (TODO, explaination) + [sample_num, 1] which indicates the weight of each sample. + The default weight of a sample is 1 if the weight layer is None. + The pair weight is the mean of the two samples' weights. :type weight: LayerOutput :param name: Evaluator name.
:type name: None|basestring @@ -326,8 +330,8 @@ def pnpair_evaluator( input = [input] if label: input.append(label) - if info: - input.append(info) + if query_id: + input.append(query_id) evaluator_base( input=input, type="pnpair", diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index a76bc97264b6ba806482d7b226d1d170064a6589..eac2cb316835fda0a52ac9895eaa80914d0f1e5b 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -21,7 +21,7 @@ from .activations import LinearActivation, SigmoidActivation, TanhActivation, \ ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation from .evaluators import * from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \ - CudnnAvgPooling, CudnnMaxPooling + CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling from .attrs import * from .default_decorators import * @@ -148,6 +148,7 @@ __all__ = [ 'resize_layer', 'sub_seq_layer', 'scale_sub_region_layer', + 'factorization_machine', ] @@ -264,10 +265,12 @@ class LayerType(object): SCALE_SUB_REGION_LAYER = 'scale_sub_region' + FACTORIZATION_MACHINE = 'factorization_machine' + @staticmethod def is_layer_type(type_name): """ - If type_name is a layer type. + Whether type_name is a layer type. :param type_name: layer type name. Because layer type enumerations are strings. @@ -438,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None): with mixed_layer(size=100) as m: m += full_matrix_projection(input=layer) - 2. When used as an independant object like this, you must set the size: + 2. When used as an independent object like this, you must set the size: .. code-block:: python @@ -448,11 +451,11 @@ def full_matrix_projection(input, size=0, param_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param size: The parameter size. Means the width of parameter. + :param size: The dimension of this layer. :type size: int - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A FullMatrixProjection Object. + :return: FullMatrixProjection Object. :rtype: FullMatrixProjection """ proj = FullMatrixProjection( @@ -465,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None): def trans_full_matrix_projection(input, size=0, param_attr=None): """ Different from full_matrix_projection, this projection performs matrix - multiplication, using transpose of weight. + multiplication, using the transpose of weight. .. math:: out.row[i] += in.row[i] * w^\mathrm{T} - :math:`w^\mathrm{T}` means transpose of weight. + :math:`w^\mathrm{T}` means the transpose of weight. The simply usage is: .. code-block:: python @@ -486,9 +489,9 @@ def trans_full_matrix_projection(input, size=0, param_attr=None): :type input: LayerOutput :param size: The parameter size. Means the width of parameter. :type size: int - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A TransposedFullMatrixProjection Object. + :return: TransposedFullMatrixProjection Object. 
:rtype: TransposedFullMatrixProjection """ proj = TransposedFullMatrixProjection( @@ -518,7 +521,7 @@ def table_projection(input, size=0, param_attr=None): with mixed_layer(size=100) as m: m += table_projection(input=layer) - 2. When used as an independant object like this, you must set the size: + 2. When used as an independent object like this, you must set the size: .. code-block:: python @@ -529,11 +532,11 @@ def table_projection(input, size=0, param_attr=None): :param input: The input of this layer, which must contains id fields. :type input: LayerOutput - :param size: The parameter size. Means the width of parameter. + :param size: The dimension of the output. :type size: int - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A TableProjection Object. + :return: TableProjection Object. :rtype: TableProjection """ proj = TableProjection( @@ -544,7 +547,7 @@ def table_projection(input, size=0, param_attr=None): def identity_projection(input, offset=None, size=None): """ - 1. IdentityProjection if offset=None. It performs: + 1. If offset=None, it performs IdentityProjection as follows: .. math:: out.row[i] += in.row[i] @@ -556,9 +559,8 @@ def identity_projection(input, offset=None, size=None): proj = identity_projection(input=layer) - 2. IdentityOffsetProjection if offset!=None. It likes IdentityProjection, - but layer size may be smaller than input size. - It select dimesions [offset, offset+layer_size) from input: + 2. If offset!=None, It executes IdentityOffsetProjection and takes the + elements of the input in the range [offset, offset+size) as output. .. math:: out.row[i] += in.row[i + \\textrm{offset}] @@ -570,14 +572,20 @@ def identity_projection(input, offset=None, size=None): proj = identity_projection(input=layer, offset=10) - Note that both of two projections should not have any parameter. + Note that neither of the projections have trainable parameter. :param input: The input of this layer. :type input: LayerOutput - :param offset: Offset, None if use default. + :param offset: The offset from the start of the input. The input's + elements in the range [offset, offset+size) will be + taken as output. If this parameter is not set or set + to None, the output will be the same as the input. :type offset: int - :return: A IdentityProjection or IdentityOffsetProjection object - :rtype: IdentityProjection or IdentityOffsetProjection + :param size: The dimension of this layer. It will be neglected + when offset is None or not set. + :type size: int + :return: IdentityProjection or IdentityOffsetProjection object + :rtype: IdentityProjection | IdentityOffsetProjection """ if offset is None: proj = IdentityProjection(input_layer_name=input.name) @@ -593,8 +601,8 @@ def identity_projection(input, offset=None, size=None): def slice_projection(input, slices): """ - slice_projection can slice the input value into multiple parts, - and then select some of them to merge into a new output. + slice_projection slices the input value into multiple parts, + then selects and merges some of them into a new output. .. math:: output = [input.slices()] @@ -605,15 +613,13 @@ def slice_projection(input, slices): proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)]) - Note that slice_projection should not have any parameter. + Note that slice_projection has no trainable parameter. :param input: The input of this layer. 
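The offset/size pair documented above composes naturally with mixed_layer. A minimal sketch (the layer name and dimensions here are illustrative, not part of this patch):

.. code-block:: python

    from paddle.trainer_config_helpers import *

    # a hypothetical 100-dimensional input
    data = data_layer(name='data', size=100)
    with mixed_layer(size=20) as m:
        # copy dimensions [10, 30) of the input; no trainable parameter involved
        m += identity_projection(input=data, offset=10, size=20)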
:type input: LayerOutput - :param slices: An array of slice parameters. - Each slice contains the start and end offsets based - on the input. - :type slices: pair of int - :return: A SliceProjection object + :param slices: A list of start and end offsets of each slice. + :type slices: list of tuple + :return: SliceProjection object. :rtype: SliceProjection """ assert len(slices) >= 1 @@ -633,8 +639,7 @@ def slice_projection(input, slices): @wrap_param_attr_default() def scaling_projection(input, param_attr=None): """ - scaling_projection multiplies the input with a scalar parameter and add to - the output. + scaling_projection multiplies the input with a scalar parameter. .. math:: out += w * in @@ -647,9 +652,9 @@ def scaling_projection(input, param_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A ScalingProjection object + :return: ScalingProjection object. :rtype: ScalingProjection """ proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr) @@ -660,8 +665,8 @@ def scaling_projection(input, param_attr=None): @wrap_param_attr_default() def dotmul_projection(input, param_attr=None): """ - DotMulProjection with a layer as input. - It performs element-wise multiplication with weight. + DotMulProjection takes a layer as input and performs + element-wise multiplication with weight. .. math:: out.row[i] += in.row[i] .* weight @@ -676,9 +681,9 @@ def dotmul_projection(input, param_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A DotMulProjection Object. + :return: DotMulProjection object. :rtype: DotMulProjection """ proj = DotMulProjection( @@ -695,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): out.row[i] += scale * (a.row[i] .* b.row[i]) where :math:`.*` means element-wise multiplication, and - scale is a config scalar, its default value is one. + scale is a config scalar, its default value is 1. The example usage is: @@ -703,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): op = dotmul_operator(a=layer1, b=layer2, scale=0.5) - :param a: Input layer1 + :param a: The first input of this layer. :type a: LayerOutput - :param b: Input layer2 + :param b: The second input of this layer. :type b: LayerOutput - :param scale: config scalar, default value is one. + :param scale: A scalar to scale the product. Its default value is 1. :type scale: float - :return: A DotMulOperator Object. + :return: DotMulOperator object. :rtype: DotMulOperator """ if 'x' in kwargs or 'y' in kwargs: @@ -735,28 +740,29 @@ def context_projection(input, """ Context Projection. - It just simply reorganizes input sequence, combines "context_len" sequence - to one context from context_start. "context_start" will be set to - -(context_len - 1) / 2 by default. If context position out of sequence + It just reorganizes input sequence, combines "context_len" elements of the + sequence to one context from context_start. "context_start" will be set to + -(context_len - 1) / 2 by default. When context position is out of sequence length, padding will be filled as zero if padding_attr = False, otherwise it is trainable. 
- For example, origin sequence is [A B C D E F G], context len is 3, then - after context projection and not set padding_attr, sequence will + For example, origin sequence is [A B C D E F G], context len is 3, padding_attr + is not set, then after context projection, sequence will be [ 0AB ABC BCD CDE DEF EFG FG0 ]. :param input: The input of this layer, which should be a sequence. :type input: LayerOutput - :param context_len: context length. + :param context_len: The length of the context. :type context_len: int - :param context_start: context start position. Default is + :param context_start: The start position of the context. The default value is -(context_len - 1)/2 :type context_start: int - :param padding_attr: Padding Parameter Attribute. If false, it means padding - always be zero. Otherwise Padding is learnable, and - parameter attribute is set by this parameter. + :param padding_attr: Parameter attribute of the padding. If the parameter is + set to False, padding will be zero. In other cases, the + padding is trainable, and its parameter attribute is set + by this parameter. :type padding_attr: bool | ParameterAttribute - :return: Projection + :return: Projection object. :rtype: Projection """ context_start = -( @@ -788,10 +794,9 @@ class MixedLayerType(LayerOutput): def __init__(self, name, size, act, bias_attr, layer_attr, parents=None): """ - Ctor. - :param name: layer name. + :param name: The name of this layer. :type name: basestring - :param size: layer size. + :param size: The dimension of this layer. :type size: int :param act: Activation type. :type act: BaseActivation @@ -799,8 +804,9 @@ class MixedLayerType(LayerOutput): whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: Extra Layer Attribute. - :type layer_attr: ExtraLayerAttribute or None + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute | None """ LayerOutput.__init__( self, @@ -865,12 +871,12 @@ def mixed_layer(size=0, bias_attr=False, layer_attr=None): """ - Mixed Layer. A mixed layer will add all inputs together, then activate. - Each inputs is a projection or operator. + Mixed Layer. A mixed layer will add all inputs together, then activate the sum. + Each input is a projection or operator. There are two styles of usages. - 1. When not set inputs parameter, use mixed_layer like this: + 1. When the parameter input is not set, use mixed_layer like this: .. code-block:: python @@ -886,21 +892,21 @@ def mixed_layer(size=0, input=[full_matrix_projection(input=layer1), full_matrix_projection(input=layer2)]) - :param name: mixed layer name. Can be referenced by other layer. + :param name: The name of this layer. It is optional. :type name: basestring - :param size: layer size. + :param size: The dimension of this layer. :type size: int - :param input: The input of this layer. It is an optional parameter. If set, - then this function will just return layer's name. + :param input: The input of this layer. It is an optional parameter. :param act: Activation Type. LinearActivation is the default activation. :type act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. 
:type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: The extra layer config. Default is None. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute - :return: MixedLayerType object can add inputs or layer name. + :return: MixedLayerType object. :rtype: MixedLayerType """ @@ -935,14 +941,15 @@ def data_layer(name, size, depth=None, height=None, width=None, :param name: The name of this layer. :type name: basestring - :param size: Size of this data layer. + :param size: The dimension of this data layer. :type size: int - :param height: Height of this data layer, used for image + :param height: The height of the input image data. :type height: int | None - :param width: Width of this data layer, used for image + :param width: The width of the input image data. :type width: int | None - :param layer_attr: Extra Layer Attribute. - :type layer_attr: ExtraLayerAttribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput """ @@ -975,14 +982,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): :param name: The name of this layer. It is optional. :type name: basestring - :param input: The input of this layer, which must be Index Data. + :param input: The input of this layer, whose type must be Index Data. :type input: LayerOutput - :param size: The embedding dimension. + :param size: The dimension of the embedding vector. :type size: int :param param_attr: The embedding parameter attribute. See ParameterAttribute for details. - :type param_attr: ParameterAttribute | None - :param layer_attr: Extra layer Config. Default is None. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1010,7 +1018,7 @@ def fc_layer(input, bias_attr=None, layer_attr=None): """ - Helper for declare fully connected layer. + The fully connected layer. The example usage is: @@ -1032,17 +1040,18 @@ def fc_layer(input, :type name: basestring :param input: The input of this layer. :type input: LayerOutput | list | tuple - :param size: The layer dimension. + :param size: The dimension of this layer. :type size: int :param act: Activation Type. TanhActivation is the default activation. :type act: BaseActivation - :param param_attr: The Parameter Attribute|list. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: Extra Layer config. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1083,13 +1092,15 @@ def fc_layer(input, @wrap_name_default("print") def printer_layer(input, format=None, name=None): """ - Print the output value of input layers. This layer is useful for debugging. + Print the output value of the layers specified by the parameter input. + This layer is useful for debugging. :param name: The name of this layer. It is optional. 
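As a hedged sketch of how the data_layer / embedding_layer / fc_layer trio documented above chains together (all names and sizes are made up for illustration):

.. code-block:: python

    from paddle.trainer_config_helpers import *

    # integer word ids in the range [0, 10000)
    word = data_layer(name='word', size=10000)
    # look up a 256-dimensional embedding vector for each id
    emb = embedding_layer(input=word, size=256)
    # project the embedding to a 128-dimensional hidden representation
    hidden = fc_layer(input=emb, size=128, act=TanhActivation())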
:type name: basestring :param input: The input of this layer. :type input: LayerOutput | list | tuple - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if isinstance(input, LayerOutput): input = [input] @@ -1132,11 +1143,12 @@ def priorbox_layer(input, :param aspect_ratio: The aspect ratio. :type aspect_ratio: list :param variance: The bounding box variance. - :type min_size: The min size of the priorbox width/height. + :type min_size: The minimum size of the priorbox width/height. :param min_size: list - :type max_size: The max size of the priorbox width/height. Could be NULL. + :type max_size: The maximum size of the priorbox width/height. It could be NULL. :param max_size: list - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ # plus one for ratio 1. num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4 @@ -1174,7 +1186,7 @@ def multibox_loss_layer(input_loc, :param name: The name of this layer. It is optional. :type name: basestring - :param input_loc: The input predict locations. + :param input_loc: The input predicted locations. :type input_loc: LayerOutput | List of LayerOutput :param input_conf: The input priorbox confidence. :type input_conf: LayerOutput | List of LayerOutput @@ -1186,13 +1198,15 @@ def multibox_loss_layer(input_loc, :type num_classes: int :param overlap_threshold: The threshold of the overlap. :type overlap_threshold: float - :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox. + :param neg_pos_ratio: The ratio of the negative bounding box to + the positive bounding box. :type neg_pos_ratio: float - :param neg_overlap: The negative bbox overlap threshold. + :param neg_overlap: The negative bounding box overlap threshold. :type neg_overlap: float :param background_id: The background class index. :type background_id: int - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if isinstance(input_loc, LayerOutput): input_loc = [input_loc] @@ -1255,19 +1269,20 @@ def detection_output_layer(input_loc, :type input_conf: LayerOutput | List of LayerOutput. :param priorbox: The input priorbox location and the variance. :type priorbox: LayerOutput - :param num_classes: The number of the classification. + :param num_classes: The number of the classes. :type num_classes: int :param nms_threshold: The Non-maximum suppression threshold. :type nms_threshold: float - :param nms_top_k: The bbox number kept of the NMS's output + :param nms_top_k: The bounding boxes number kept of the NMS's output. :type nms_top_k: int - :param keep_top_k: The bbox number kept of the layer's output + :param keep_top_k: The bounding boxes number kept of the layer's output. :type keep_top_k: int - :param confidence_threshold: The classification confidence threshold + :param confidence_threshold: The classification confidence threshold. :type confidence_threshold: float :param background_id: The background class index. :type background_id: int - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if isinstance(input_loc, LayerOutput): input_loc = [input_loc] @@ -1323,7 +1338,7 @@ def roi_pool_layer(input, A layer used by Fast R-CNN to extract feature maps of ROIs from the last feature map. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput. 
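A minimal sketch of the ROI pooling interface described above, assuming the usual (input, rois) pair from the Fast R-CNN setting (the names and sizes are illustrative):

.. code-block:: python

    from paddle.trainer_config_helpers import *

    # a 256-channel 14x14 feature map and the candidate regions
    feats = data_layer(name='feats', size=256 * 14 * 14, height=14, width=14)
    rois = data_layer(name='rois', size=10)
    # pool each region into a fixed 7x7 grid
    roi_out = roi_pool_layer(input=feats, rois=rois,
                             pooled_width=7, pooled_height=7,
                             spatial_scale=1. / 16, num_channels=256)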
@@ -1335,9 +1350,10 @@ :type pooled_height: int :param spatial_scale: The spatial scale between the image and feature map. :type spatial_scale: float - :param num_channels: number of input channel. + :param num_channels: The number of input channels. :type num_channels: int - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if num_channels is None: assert input.num_filters is not None @@ -1358,18 +1374,19 @@ @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """ - Normalize a layer's output. This layer is necessary for ssd. - This layer applys normalize across the channels of each sample to - a conv layer's output and scale the output by a group of trainable - factors which dimensions equal to the channel's number. + Normalize a layer's output. This layer is necessary for ssd. This + layer applies normalization across the channels of each sample to + a convolutional layer's output and scales the output by a group of + trainable factors whose number equals the number of channels. :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput - :param param_attr: The Parameter Attribute|list. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ assert input.num_filters is not None Layer( @@ -1410,12 +1427,9 @@ def pooling_layer(input, Pooling layer for sequence inputs, not used for Image. If stride > 0, this layer slides a window whose size is determined by stride, - and return the pooling value of the window as the output. Thus, a long sequence - will be shorten. - - The parameter stride specifies the intervals at which to apply the pooling - operation. Note that for sequence with sub-sequence, the default value - of stride is -1. + and returns the pooling value of the sequence in the window as the output. Thus, + a long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The example usage is: @@ -1432,16 +1446,16 @@ :type name: basestring :param input: The input of this layer. :type input: LayerOutput - :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, - SumPooling, SquareRootNPooling. + :param pooling_type: Type of pooling. MaxPooling is the default pooling. :type pooling_type: BasePoolingType | None :param stride: The step size between successive pooling regions. - :type stride: Int + :type stride: int :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: The Extra Attributes for layer, such as dropout. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1516,34 +1530,33 @@ def lstmemory(input, NOTE: This is a low level user interface. You can use network.simple_lstm to config a simple plain lstm layer. - Please refer to **Generating Sequences With Recurrent Neural Networks** for - more details about LSTM. - - Link_ goes as below. - - ..
_Link: http://arxiv.org/abs/1308.0850 + Reference: + `Generating Sequences With Recurrent Neural Networks + `_ - :param name: The lstmemory layer name. + :param name: The name of this layer. It is optional. :type name: basestring - :param size: DEPRECATED. size of the lstm cell + :param size: DEPRECATED. The dimension of the lstm cell. :type size: int :param input: The input of this layer. :type input: LayerOutput - :param reverse: is sequence process reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type. TanhActivation is the default activation. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. + :param gate_act: Activation type of this layer's gates. SigmoidActivation is the + default activation. :type gate_act: BaseActivation - :param state_act: state activation type, TanhActivation by default. + :param state_act: Activation type of the state. TanhActivation is the default activation. :type state_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1632,14 +1645,14 @@ def grumemory(input, h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}} NOTE: In PaddlePaddle's implementation, the multiplication operations - :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in - gate_recurrent layer. Consequently, an additional mixed_layer with + :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed + in gate_recurrent layer. Consequently, an additional mixed_layer with full_matrix_projection or a fc_layer must be included before grumemory is called. - More details can be found by referring to `Empirical Evaluation of Gated - Recurrent Neural Networks on Sequence Modeling. - `_ + Reference: + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling + `_ The simple usage is: @@ -1647,28 +1660,29 @@ def grumemory(input, gru = grumemory(input) - :param name: The gru layer name. - :type name: None | basestring + :param name: The name of this layer. It is optional. + :type name: basestring :param input: The input of this layer. :type input: LayerOutput. - :param size: DEPRECATED. size of the gru cell + :param size: DEPRECATED. The dimension of the gru cell. :type size: int - :param reverse: Whether sequence process is reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type, TanhActivation is the default. This activation affects the :math:`{\\tilde{h_t}}`. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. - This activation affects the :math:`z_t` and :math:`r_t`. It is the - :math:`\\sigma` in the above formula. + :param gate_act: Activation type of this layer's two gates. SigmoidActivation is + the default activation. 
This activation affects the :math:`z_t` + and :math:`r_t`. It is the :math:`\\sigma` in the above formula. :type gate_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1712,10 +1726,10 @@ def last_seq(input, """ Get Last Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the last value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the last value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1724,14 +1738,16 @@ def last_seq(input, seq = last_seq(input=layer) :param agg_level: Aggregated level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. - :type layer_attr: ExtraLayerAttribute. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput """ @@ -1768,10 +1784,10 @@ def first_seq(input, """ Get First Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the first value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the first value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1780,13 +1796,15 @@ def first_seq(input, seq = first_seq(input=layer) :param agg_level: aggregation level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. 
:rtype: LayerOutput @@ -1844,8 +1862,8 @@ def expand_layer(input, expand_level=ExpandLevel.FROM_NO_SEQUENCE, layer_attr=None): """ - A layer for "Expand Dense data or (sequence data where the length of each - sequence is one) to sequence data." + A layer for expanding dense data or (sequence data where the length of each + sequence is one) to sequence data. The example usage is: @@ -1857,7 +1875,9 @@ def expand_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param expand_as: Expand as this layer's sequence info. + :param expand_as: Expand the input according to this layer's sequence information. + After the operation, the expanded input will have the same number of + elements as this layer. :type expand_as: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring @@ -1865,9 +1885,10 @@ whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param expand_level: whether input layer is timestep(default) or sequence. + :param expand_level: Whether the input layer is a sequence or the element of a sequence. :type expand_level: ExpandLevel - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -1900,9 +1921,12 @@ def repeat_layer(input, A layer for repeating the input for num_repeats times. If as_row_vector: + .. math:: y = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n] + If not as_row_vector: + .. math:: y = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n] @@ -1915,19 +1939,19 @@ :param input: The input of this layer. :type input: LayerOutput - :param num_repeats: Repeat the input so many times + :param num_repeats: The number of times to repeat the input. :type num_repeats: int :param name: The name of this layer. It is optional. - :param as_row_vector: True for treating input as row vector and repeating - in the column direction. This is equivalent to apply - concat_layer() with num_repeats same input. - False for treating input as column vector and repeating - in the row direction. + :type name: basestring + :param as_row_vector: Whether to treat the input as row vectors or not. If + the parameter is set to True, the repeating operation + will be performed in the column direction. Otherwise, + it will be performed in the row direction. :type as_row_vector: bool :param act: Activation type. IdentityActivation is the default activation. :type act: BaseActivation - :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -1974,13 +1998,14 @@ def seq_reshape_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param reshape_size: the size of reshaped sequence. + :param reshape_size: The dimension of the reshaped sequence. :type reshape_size: int :param name: The name of this layer. It is optional. :type name: basestring :param act: Activation type. IdentityActivation is the default activation. :type act: BaseActivation - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute.
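To make the two as_row_vector modes of repeat_layer above concrete, a small sketch on a hypothetical 3-dimensional input (names are illustrative):

.. code-block:: python

    from paddle.trainer_config_helpers import *

    x = data_layer(name='x', size=3)  # conceptually [x1, x2, x3]
    # row mode: [x1, x2, x3, x1, x2, x3]
    row = repeat_layer(input=x, num_repeats=2, as_row_vector=True)
    # column mode: [x1, x1, x2, x2, x3, x3]
    col = repeat_layer(input=x, num_repeats=2, as_row_vector=False)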
:param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the @@ -2008,7 +2033,7 @@ def seq_reshape_layer(input, @layer_support() def interpolation_layer(input, weight, name=None, layer_attr=None): """ - This layer is for linear interpolation with two inputs, + This layer performs linear interpolation on two inputs, which is used in NEURAL TURING MACHINE. .. math:: @@ -2030,7 +2055,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None): :type weight: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2064,7 +2090,7 @@ def bilinear_interp_layer(input, name=None, layer_attr=None): """ - This layer is to implement bilinear interpolation on conv layer output. + This layer implements bilinear interpolation on convolutional layer's output. Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation @@ -2074,18 +2100,19 @@ def bilinear_interp_layer(input, bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64) - :param input: A input layer. - :type input: LayerOutput. - :param out_size_x: bilinear interpolation output width. - :type out_size_x: int | None - :param out_size_y: bilinear interpolation output height. - :type out_size_y: int | None - :param name: The layer's name, which cna not be specified. - :type name: None | basestring - :param layer_attr: Extra Layer attribute. - :type layer_attr: ExtraLayerAttribute + :param input: The input of this layer. + :type input: LayerOutput. + :param out_size_x: The width of the output. + :type out_size_x: int + :param out_size_y: The height of the output. + :type out_size_y: int + :param name: The name of this layer. It is optional. + :type name: basestring + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. - :rtype: LayerOutput + :rtype: LayerOutput """ assert input.layer_type == LayerType.CONV_LAYER assert isinstance(input.activation, LinearActivation) @@ -2120,8 +2147,8 @@ def power_layer(input, weight, name=None, layer_attr=None): .. math:: y = x^w - where :math:`x` is a input vector, :math:`w` is scalar weight, - and :math:`y` is a output vector. + where :math:`x` is an input vector, :math:`w` is a scalar exponent, + and :math:`y` is an output vector. The example usage is: @@ -2131,11 +2158,12 @@ def power_layer(input, weight, name=None, layer_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param weight: Weight layer. + :param weight: The exponent of the power. :type weight: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2175,11 +2203,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param weight: Weight layer. + :param weight: The weight of each sample. :type weight: LayerOutput :param name: The name of this layer. It is optional. 
:type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2217,7 +2246,8 @@ def trans_layer(input, name=None, layer_attr=None): :type input: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2253,11 +2283,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param height: The height of the sample matrix + :param height: The height of the sample matrix. :type height: int + :param width: The width of the sample matrix. + :type width: int :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2302,15 +2335,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): :param name: The name of this layer. It is optional. :type name: basestring - :param a: input layer a + :param a: The first input of this layer. :type a: LayerOutput - :param b: input layer b + :param b: The second input of this layer. :type b: LayerOutput - :param scale: scale for cosine value. default is 5. + :param scale: The scale of the cosine similarity. 1 is the default value. :type scale: float - :param size: layer size. NOTE size_a * size should equal size_b. + :param size: The dimension of this layer. NOTE size_a * size should equal size_b. :type size: int - :param layer_attr: Extra Layer Attribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput @@ -2395,8 +2428,10 @@ def hsigmoid(input, """ Organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of belonging to the right branch. - This idea is from "F. Morin, Y. Bengio (AISTATS 05): - Hierarchical Probabilistic Neural Network Language Model." + + Reference: + `Hierarchical Probabilistic Neural Network Language Model + `_ The example usage is: @@ -2407,19 +2442,21 @@ def hsigmoid(input, :param input: The input of this layer. :type input: LayerOutput | list | tuple - :param label: Label layer. + :param label: The input label. :type label: LayerOutput - :param num_classes: number of classes. - :type num_classes: int | None + :param num_classes: The number of classes. And it should be larger than 2. If the parameter + is not set or set to None, its actual value will be automatically set to + the number of labels. + :type num_classes: int :param name: The name of this layer. It is optional. :type name: basestring :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. None means default parameter. 
- :type param_attr: ParameterAttribute | None - :param layer_attr: Extra Layer Attribute. + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput @@ -2505,15 +2542,21 @@ def img_conv_layer(input, what-are-deconvolutional-layers/>`_ . The num_channel means input image's channel number. It may be 1 or 3 when input is raw pixels of image(mono or RGB), or it may be the previous layer's - num_filters * num_group. + num_filters. There are several groups of filters in PaddlePaddle implementation. - Each group will process some channels of the input. For example, if - num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create - 32*4 = 128 filters to process the input. The channels will be split into 4 - pieces. First 256/4 = 64 channels will be processed by first 32 filters. The - rest channels will be processed by the rest groups of filters. - + If the groups attribute is greater than 1, for example groups=2, + the input will be split into 2 parts along the channel axis, and + the filters will also be split into 2 parts. The first half of the filters + is only connected to the first half of the input channels, while the second + half of the filters is only connected to the second half of the input. After + the computation of convolution for each part of input, + the output will be obtained by concatenating the two results. + + For the details of grouped convolution, please refer to: + `ImageNet Classification with Deep Convolutional Neural Networks + `_ + The example usage is: .. code-block:: python @@ -2538,7 +2581,8 @@ def img_conv_layer(input, :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter is not set, it will be set automatically according to filter_size. :type filter_size_y: int - :param num_filters: Each filter group's number of filter + :param num_filters: The number of filters. It is as same as the output image channel. + :type num_filters: int :param act: Activation type. ReluActivation is the default activation. :type act: BaseActivation :param groups: The group number. 1 is the default group number. @@ -2690,7 +2734,8 @@ def img_pool_layer(input, pool_size_y=None, stride_y=None, padding_y=None, - ceil_mode=True): + ceil_mode=True, + exclude_mode=None): """ Image pooling Layer. @@ -2702,15 +2747,17 @@ .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} The example usage is: @@ -2754,10 +2801,15 @@ :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details. :type layer_attr: ExtraLayerAttribute - :param ceil_mode: Wether to use the ceil function to calculate output height and width.
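To make the grouping arithmetic above concrete, a hedged sketch (the channel counts are made up): with num_channels=8 and groups=2, each half of the 16 filters sees only 4 of the input channels.

.. code-block:: python

    from paddle.trainer_config_helpers import *

    img = data_layer(name='image', size=8 * 32 * 32, height=32, width=32)
    # filters are split into two groups, each connected to
    # half (8 / 2 = 4) of the input channels
    conv = img_conv_layer(input=img, filter_size=3, num_filters=16,
                          num_channels=8, groups=2, stride=1, padding=1,
                          act=ReluActivation())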
+ :param ceil_mode: Whether to use the ceil function to calculate output height and width. True is the default. If it is set to False, the floor function will be used. :type ceil_mode: bool + :param exclude_mode: Whether to exclude the padding cells when calculating the average. + It only works when pool_type is AvgPooling. If it is set to None, + the padding cells are also excluded. When cudnn is used, choose + CudnnAvgPooling or CudnnAvgInclPadPooling as pool_type to specify + the mode instead. + :type exclude_mode: bool :return: LayerOutput object. :rtype: LayerOutput """ @@ -2771,7 +2823,7 @@ pool_type.name = 'avg' assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling, - CudnnMaxPooling], \ + CudnnMaxPooling, CudnnAvgInclPadPooling], \ "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported" type_name = pool_type.name + '-projection' \ @@ -2800,6 +2852,7 @@ padding_y=padding_y)) ], ceil_mode=ceil_mode, + exclude_mode=exclude_mode, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( name, @@ -2837,17 +2890,21 @@ def img_pool3d_layer(input, .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) - d = 1 + int(ceil(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) - d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} The example usage is: @@ -2969,8 +3026,8 @@ def spp_layer(input, A layer performs spatial pyramid pooling. Reference: - Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition - https://arxiv.org/abs/1406.4729 + `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition + `_ The example usage is: @@ -3071,8 +3128,8 @@ def img_cmrnorm_layer(input, Response normalization across feature maps. Reference: - ImageNet Classification with Deep Convolutional Neural Networks - http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf + `ImageNet Classification with Deep Convolutional Neural Networks + `_ The example usage is: @@ -3118,6 +3175,7 @@ def batch_norm_layer(input, param_attr=None, layer_attr=None, batch_norm_type=None, + epsilon=1e-5, moving_average_fraction=0.9, use_global_stats=None, mean_var_names=None): @@ -3137,9 +3195,9 @@ y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift Reference: - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift - http://arxiv.org/abs/1502.03167 + `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift `_ The example usage is: @@ -3188,6 +3246,8 @@ will use the mean and variance of the current batch of test data. :type use_global_stats: bool | None.
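Circling back to img_pool_layer, a hedged sketch of the new exclude_mode switch (input sizes are illustrative); setting it to False averages over the padding cells as well:

.. code-block:: python

    from paddle.trainer_config_helpers import *

    img = data_layer(name='image', size=1 * 28 * 28, height=28, width=28)
    pool = img_pool_layer(input=img, pool_size=3, stride=2, padding=1,
                          num_channels=1, pool_type=AvgPooling(),
                          exclude_mode=False)  # include padding cells in the average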
+ :param epsilon: The small constant added to the variance to improve numeric stability. + :type epsilon: float. :param moving_average_fraction: Factor used in the moving average computation. :math:`runningMean = newMean*(1-factor) + runningMean*factor` :type moving_average_fraction: float. @@ -3205,6 +3265,7 @@ def batch_norm_layer(input, assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \ (batch_norm_type == "mkldnn_batch_norm") or \ (batch_norm_type == "cudnn_batch_norm") + l = Layer( name=name, img3D=img3D, @@ -3214,6 +3275,7 @@ def batch_norm_layer(input, type=LayerType.BATCH_NORM_LAYER, batch_norm_type=batch_norm_type, bias=ParamAttr.to_bias(bias_attr), + epsilon=epsilon, moving_average_fraction=moving_average_fraction, use_global_stats=use_global_stats, mean_var_names=mean_var_names, @@ -3273,7 +3335,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): A layer for L2-normalization in each row. .. math:: - out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}} where the size of :math:`in` is (batchSize x dataDim) , and the size of :math:`out` is a (batchSize x dataDim) . @@ -4236,7 +4298,7 @@ def dot_prod_layer(input1, input2, name=None, layer_attr=None): :param name: The name of this layer. It is optional. :type name: basestring :param input1: The first input layer. - :type input: LayerOutput + :type input1: LayerOutput :param input2: The second input layer. :type input2: LayerOutput :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for @@ -5392,18 +5454,28 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): to be devided by groups. Reference: - Maxout Networks - http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf - Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks - https://arxiv.org/pdf/1312.6082v4.pdf + `Maxout Networks + `_ + `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks + `_ + .. math:: - y_{si+j} = \max_k x_{gsi + sk + j} - g = groups - s = input.size / num_channels - 0 \le i < num_channels / groups - 0 \le j < s - 0 \le k < groups + + & out = \max_k (in[n, k, o_c , s]) + + & out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} + + & s = \\frac{input.size}{ num\_channels} + + & o_{c} = \\frac{num\_channels}{groups} + + & 0 \le i < o_{c} + + & 0 \le j < s + + & 0 \le k < groups + The simple usage is: @@ -5460,9 +5532,9 @@ def ctc_layer(input, alignment between the inputs and the target labels is unknown. Reference: - Connectionist Temporal Classification: Labelling Unsegmented Sequence Data + `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf + `_ Note: Considering the 'blank' label needed by CTC, you need to use (num_classes + 1) @@ -5534,9 +5606,9 @@ def warp_ctc_layer(input, install it to :code:`third_party/install/warpctc` directory. Reference: - Connectionist Temporal Classification: Labelling Unsegmented Sequence Data + `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf + `_ Note: - Let num_classes represents the category number. Considering the 'blank' @@ -5756,8 +5828,8 @@ def nce_layer(input, Noise-contrastive estimation. 
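Before moving on, a short sketch of the epsilon knob added to batch_norm_layer above (the value is illustrative; per the assertion added in the config parser it must be at least 1e-5):

.. code-block:: python

    from paddle.trainer_config_helpers import *

    img = data_layer(name='image', size=16 * 8 * 8, height=8, width=8)
    conv = img_conv_layer(input=img, filter_size=3, num_filters=16,
                          num_channels=16, padding=1, act=LinearActivation())
    bn = batch_norm_layer(input=conv, act=ReluActivation(),
                          epsilon=1e-4, moving_average_fraction=0.9)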
Reference: - A fast and simple algorithm for training neural probabilistic language - models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf + `A fast and simple algorithm for training neural probabilistic language + models. `_ The example usage is: @@ -5872,8 +5944,8 @@ def rank_cost(left, A cost Layer for learning to rank using gradient descent. Reference: - Learning to Rank using Gradient Descent - http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf + `Learning to Rank using Gradient Descent + `_ .. math:: @@ -6140,9 +6212,11 @@ def huber_regression_cost(input, Given a prediction f(x), a label y and :math:`\delta`, the loss function is defined as: - .. math: - loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta - loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise + .. math:: + + loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta + + loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise The example usage is: @@ -6189,12 +6263,14 @@ def huber_classification_cost(input, """ For classification purposes, a variant of the Huber loss called modified Huber is sometimes used. Given a prediction f(x) (a real-valued classifier score) and - a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber + a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber loss is defined as: - .. math: - loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 - loss = -4yf(x), \text{otherwise} + .. math:: + + loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1 + + loss = -4yf(x), otherwise The example usage is: @@ -6408,8 +6484,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases} Reference: - Fast R-CNN - https://arxiv.org/pdf/1504.08083v2.pdf + `Fast R-CNN + `_ The example usage is: @@ -6560,7 +6636,7 @@ def row_conv_layer(input, .. math:: r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}} - \quad \text{for} \quad (1 \leq i \leq d) + \quad \\text{for} \quad (1 \leq i \leq d) Note: The `context_len` is `k + 1`. That is to say, the lookahead step @@ -6604,18 +6680,19 @@ @layer_support() @wrap_name_default() -@wrap_param_attr_default() def prelu_layer(input, name=None, partial_sum=1, + channel_shared=None, + num_channels=None, param_attr=None, layer_attr=None): """ The Parametric Relu activation that actives outputs with a learnable weight. Reference: - Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ .. math:: z_i &\\quad if \\quad z_i > 0 \\\\ a_i * z_i &\\quad \\mathrm{otherwise} @@ -6638,6 +6715,14 @@ - partial_sum = number of outputs, indicates all elements share the same weight. :type partial_sum: int + :param channel_shared: Whether or not the parameters are shared across channels. + + - channel_shared = True, we set the partial_sum to the number of outputs. + - channel_shared = False, we set the partial_sum to the number of elements in one channel. + + :type channel_shared: bool + :param num_channels: The number of input channels. + :type num_channels: int :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute :param layer_attr: The extra layer attribute.
@@ -6648,7 +6733,25 @@ def prelu_layer(input, """ assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.' - assert isinstance(param_attr, ParameterAttribute) + + if not param_attr: + param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0) + else: + assert isinstance(param_attr, ParameterAttribute) + + if num_channels is None: + assert input.num_filters is not None, \ + 'the input channel cannot be detected, please specify the num_channels parameter' + num_channels = input.num_filters + + if channel_shared is not None: + assert isinstance(channel_shared, bool) + assert (input.height != 0 and input.width != 0), \ + 'input height and width must be set' + if channel_shared: + partial_sum = input.height * input.width * num_channels + else: + partial_sum = input.height * input.width l = Layer( name=name, @@ -6660,6 +6763,7 @@ def prelu_layer(input, name=name, layer_type=LayerType.PRELU, parents=input, + num_filters=num_channels, size=l.config.size) @@ -6681,11 +6785,11 @@ def gated_unit_layer(input, The gated unit layer implements a simple gating mechanism over the input. The input :math:`X` is first projected into a new space :math:`X'`, and it is also used to produce a gate weight :math:`\sigma`. Element-wise - product between :match:`X'` and :math:`\sigma` is finally returned. + product between :math:`X'` and :math:`\sigma` is finally returned. Reference: - Language Modeling with Gated Convolutional Networks - https://arxiv.org/abs/1612.08083 + `Language Modeling with Gated Convolutional Networks + <https://arxiv.org/abs/1612.08083>`_ .. math:: y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) @@ -6821,6 +6925,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): :param input: The input of this layer. If two inputs are given, the second one will be regarded as the reference. + The input must be 4-dimensional and in NCHW order. :type input: LayerOutput | Sequence :param offset: The crop offset. :type offset: Sequence @@ -6910,7 +7015,7 @@ def clip_layer(input, min, max, name=None): .. math:: - out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + out[i] = \min (\max (in[i],p_{1} ),p_{2} ) .. code-block:: python @@ -7079,7 +7184,7 @@ def img_conv3d_layer(input, :param filter_size: The dimensions of the filter kernel along three axises. If the parameter is set to one integer, the three dimensions will be same. :type filter_size: int | tuple | list - :param num_filters: The number of filters in each group. + :param num_filters: The number of filters. It is the same as the number of output image channels. :type num_filters: int :param act: Activation type. ReluActivation is the default activation. :type act: BaseActivation @@ -7354,3 +7459,73 @@ def scale_sub_region_layer(input, indices, value, name=None): parents=[input, indices], num_filters=input.num_filters, size=input.size) + + +@wrap_name_default() +@wrap_act_default(act=LinearActivation()) +@wrap_param_attr_default() +@layer_support() +def factorization_machine(input, + factor_size, + act=None, + name=None, + param_attr=None, + layer_attr=None): + """ + The Factorization Machine models pairwise feature interactions as inner + product of the learned latent vectors corresponding to each input feature. + The Factorization Machine can effectively capture feature interactions + especially when the input is sparse. + + This implementation only considers the 2-order feature interactions using + Factorization Machine with the formula: + + ..
math:: + y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j + + Note: + X is the input vector with size n. V is the factor matrix. Each row of V + is the latent vector corresponding to each input dimension. The size of + each latent vector is k. + + For details of Factorization Machine, please refer to the paper: + Factorization machines. + + .. code-block:: python + first_order = paddle.layer.fc(input=input, + size=1, + act=paddle.activation.Linear()) + second_order = paddle.layer.factorization_machine(input=input, + factor_size=10) + fm = paddle.layer.addto(input=[first_order, second_order], + act=paddle.activation.Linear(), + bias_attr=False) + + :param input: The input layer. Supported input types: all input data types + on CPU, and only dense input types on GPU. + :type input: LayerOutput + :param factor_size: The hyperparameter that defines the dimensionality of + the latent vector size. + :type factor_size: int + :param act: Activation Type. Default is linear activation. + :type act: BaseActivation + :param param_attr: The parameter attribute. See ParameterAttribute for + details. + :type param_attr: ParameterAttribute + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput) + assert factor_size > 0, "the factor_size must be greater than 0." + + Layer( + inputs=[Input(input.name, **param_attr.attr)], + name=name, + factor_size=factor_size, + type=LayerType.FACTORIZATION_MACHINE, + active_type=act.name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 9776ae18057d57dd994fac8b62090258252922c6..b5cde7bac779ee1d54395b68941df2693e1ed0f5 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -25,10 +25,10 @@ from paddle.trainer.config_parser import * __all__ = [ 'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg', - 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', - 'simple_attention', 'dot_product_attention', 'multi_head_attention', - 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', - 'inputs', 'outputs' + 'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit', + 'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention', + 'multi_head_attention', 'simple_gru2', 'bidirectional_gru', + 'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs' ] ###################################################### @@ -251,13 +251,13 @@ def img_conv_bn_pool(input, pool_layer_attr=None): """ Convolution, batch normalization, pooling group. - + Img input => Conv => BN => Pooling => Output. :param name: group name. :type name: basestring :param input: input layer. - :type input: LayerOutput + :type input: LayerOutput :param filter_size: see img_conv_layer for details. :type filter_size: int :param num_filters: see img_conv_layer for details.
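The second-order interaction that the ``factorization_machine`` layer added to layers.py above computes can be checked in plain NumPy; the snippet below is illustrative only and independent of Paddle, with the second function using the usual linear-time rewrite of the pairwise sum:

.. code-block:: python

    import numpy as np

    def fm_naive(x, v):
        # sum over i < j of <v_i, v_j> * x_i * x_j, exactly the docstring formula
        n = x.shape[0]
        return sum(np.dot(v[i], v[j]) * x[i] * x[j]
                   for i in range(n) for j in range(i + 1, n))

    def fm_fast(x, v):
        # equivalent O(n*k) form: 0.5 * sum_f ((x @ v)_f ** 2 - (x**2 @ v**2)_f)
        return 0.5 * np.sum(np.square(x.dot(v)) - (x * x).dot(np.square(v)))

    x = np.random.rand(8)        # input vector, size n = 8
    v = np.random.rand(8, 3)     # factor matrix, rows are latent vectors, k = 3
    assert np.isclose(fm_naive(x, v), fm_fast(x, v))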
@@ -435,6 +435,85 @@ def img_conv_group(input, input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type) + +@wrap_name_default("separable_conv") +def img_separable_conv(input, + num_channels, + num_out_channels, + filter_size, + stride=1, + padding=0, + depth_multiplier=1, + act=None, + bias_attr=None, + param_attr=None, + shared_bias=True, + layer_type='exconv', + name=None): + """ + Separable Convolution. + + The separable convolution module consists of a depthwise convolution + that acts separately on input channels, followed by a pointwise convolution + with 1*1 kernels that mixes channels. It is used for Xception: + https://arxiv.org/pdf/1610.02357.pdf + + :param input: input layer. + :type input: LayerOutput + :param num_channels: the number of input channels. + :type num_channels: int + :param num_out_channels: the number of output channels. + :type num_out_channels: int + :param filter_size: the filter size for the depthwise convolution. + :type filter_size: int|tuple + :param stride: the stride size for the depthwise convolution. + :type stride: int|tuple + :param padding: the padding size for the depthwise convolution. + :type padding: int|tuple + :param depth_multiplier: the number of filters for one channel in the + depthwise convolution. + :type depth_multiplier: int + :param act: the activation function for the output. + :type act: BaseActivation + :param bias_attr: see img_conv_layer for details. + :type bias_attr: ParameterAttribute + :param param_attr: see img_conv_layer for details. + :type param_attr: ParameterAttribute + :param shared_bias: see img_conv_layer for details. + :type shared_bias: bool + :param layer_type: see img_conv_layer for details. + :type layer_type: basestring + :return: layer's output + :rtype: LayerOutput + """ + __depthwise_conv__ = img_conv_layer( + name="%s_depthwise_conv" % name, + input=input, + num_channels=num_channels, + num_filters=num_channels * depth_multiplier, + groups=num_channels, + filter_size=filter_size, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=bias_attr, + param_attr=param_attr, + shared_biases=shared_bias, + layer_type=layer_type) + __pointwise_conv__ = img_conv_layer( + name="%s_pointwise_conv" % name, + input=__depthwise_conv__, + num_channels=num_channels * depth_multiplier, + num_filters=num_out_channels, + filter_size=1, + stride=1, + padding=0, + act=act, + bias_attr=bias_attr, + param_attr=param_attr, + shared_biases=shared_bias) + return __pointwise_conv__ + + def small_vgg(input_image, num_channels, num_classes): def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None): return img_conv_group( @@ -648,7 +727,7 @@ def lstmemory_unit(input, lstm_bias_attr=None, lstm_layer_attr=None): """ - lstmemory_unit defines the caculation process of a LSTM unit during a + lstmemory_unit defines the calculation process of a LSTM unit during a single time step. This function is not a recurrent layer, so it can not be directly used to process sequence input. This function is always used in recurrent_group (see layers.py for more details) to implement attention
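A usage sketch for the ``img_separable_conv`` helper above; the shapes are illustrative. For a 3-channel input and 64 output channels with 3x3 kernels, the depthwise step carries 3*3*3 = 27 weights and the pointwise step 3*64 = 192, versus 3*64*3*3 = 1728 for a dense convolution:

.. code-block:: python

    img = data_layer(name='image', size=3 * 32 * 32, height=32, width=32)
    sep = img_separable_conv(input=img,
                             num_channels=3,
                             num_out_channels=64,
                             filter_size=3,
                             stride=1,
                             padding=1,
                             act=ReluActivation(),
                             name='sep1')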
@@ -869,7 +948,7 @@ def gru_unit(input, gru_layer_attr=None, naive=False): """ - gru_unit defines the calculation process of a gated recurrent unit during a single + gru_unit defines the calculation process of a gated recurrent unit during a single time step. This function is not a recurrent layer, so it can not be directly used to process sequence input. This function is always used in the recurrent_group (see layers.py for more details) to implement attention @@ -1012,7 +1091,7 @@ def simple_gru(input, simple_gru in network.py. The reason why there are so many interfaces is that we have two ways to implement recurrent neural network. One way is to use one complete layer to implement rnn (including simple rnn, gru and lstm) - with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But + with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But the multiplication operation :math:`W x_t` is not computed in these layers. See details in their interfaces in layers.py. The other implementation is to use an recurrent group which can ensemble a @@ -1116,11 +1195,12 @@ def simple_gru2(input, :type act: BaseActivation :param gate_act: gate activiation type of gru :type gate_act: BaseActivation - :param gru_bias_attr: bias parameter attribute of gru layer, + :param gru_bias_attr: bias parameter attribute of gru layer, False means no bias, None means default bias. :type gru_bias_attr: ParameterAttribute|False|None - :param gru_layer_attr: Extra attribute of the gru layer. - :type gru_layer_attr: ExtraLayerAttribute + :param gru_param_attr: param parameter attribute of gru layer, + None means default param. + :type gru_param_attr: ParameterAttribute|None :return: the gru group. :rtype: LayerOutput """ @@ -1188,7 +1268,7 @@ def bidirectional_gru(input, :type size: int :param return_seq: If set False, the last time step of output are concatenated and returned. - If set True, the entire output sequences in forward + If set True, the entire output sequences in forward and backward directions are concatenated and returned. :type return_seq: bool :return: LayerOutput object. @@ -1277,7 +1357,7 @@ def bidirectional_lstm(input, :type size: int :param return_seq: If set False, the last time step of output are concatenated and returned. - If set True, the entire output sequences in forward + If set True, the entire output sequences in forward and backward directions are concatenated and returned. :type return_seq: bool :return: LayerOutput object. diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py index f45616551bcd4822c668234c3afaf6aa35cd2953..e0aeb311b3ae842aee337dbbf869e2f947d22bd9 100644 --- a/python/paddle/trainer_config_helpers/poolings.py +++ b/python/paddle/trainer_config_helpers/poolings.py @@ -16,7 +16,8 @@ __all__ = [ "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling", - "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling" + "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling", + "SumPooling", "SquareRootNPooling" ] @@ -88,6 +89,16 @@ class CudnnAvgPooling(BasePoolingType): BasePoolingType.__init__(self, "cudnn-avg-pool") +class CudnnAvgInclPadPooling(BasePoolingType): + """ + Cudnn average pooling only supports GPU. Return the average value in the + pooling window taking into account the padding cells. + """ + + def __init__(self): + BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool") + + class AvgPooling(BasePoolingType): """ Average pooling.
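A hypothetical config sketch for the new ``CudnnAvgInclPadPooling`` type above; the only difference from ``CudnnAvgPooling`` is that padding cells count toward the divisor, so border windows are averaged over the full pool_size*pool_size area rather than over the real cells alone:

.. code-block:: python

    img = data_layer(name='image', size=16 * 16, height=16, width=16)
    pool = img_pool_layer(input=img, num_channels=1, pool_size=3, stride=2,
                          padding=1, pool_type=CudnnAvgInclPadPooling())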
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index a21f67a2d99e7eab39708e2a571d30d7e9f20ce6..10c941f707498ec45e79bed9d3f8054eea19887d 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer -test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer) +test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer +test_factorization_machine) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py index 01d31ef3fad827bfd103ee00f4ddd1bde14e0f82..c944a96042a96401b0309e123077041724f60246 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-3, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py index 91849b40a0801b07642f96c061755597cd2ec073..27b11ffdfc7c7f8171c96997cf7fb50fe6b7667d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(learning_rate=1e-3, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py index f87237f9b59a833825841bcdd605c2332c2d5941..6a900518272d380fa4827161fdf626e5fa2e8fae 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py +++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py index 7012dbf6a0b70957d6227d4125f4cd75b9abb215..06115d62e775d843eeca9d11e1def910399fec19 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py +++ b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. ''' Test all activations. ''' diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py index a607a62c99f69ac4921a465a20f00b6413b31c8e..f5e90fdd89912d103e7404e10e1a3e8762d7c43f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py +++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py index dc8975cb311582a621eb4a5a166ddc34348fe3e9..c683d378caad278db0d0bcabdd4c7f2c5c62ace7 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/projections.py +++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. ''' Test mixed layer, projections and operators. ''' diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr index b14121e82cb7d9516c4771fc896b9b3b9e01d1c8..3e0f957648879d4350d662b336c953273bac1378 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr @@ -65,6 +65,7 @@ layers { height: 227 width: 227 depth: 1 + epsilon: 1e-05 } layers { name: "__crmnorm_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr index c7a487a11231cba6182b654108773037bdb0ec35..a18a4652e14c0cfc4dbca87e67d31aa663ee756b 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr @@ -65,6 +65,7 @@ layers { height: 256 width: 256 depth: 1 + epsilon: 1e-05 } layers { name: "__crmnorm_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr index 832ed24a31dd2bedba9a4fce77d7a088d1796fdb..9b69ae4a3b3cbcc7c0c69a2d5b3728e2f0204f33 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr @@ -36,6 +36,7 @@ layers { height: 6 width: 20 depth: 3 + epsilon: 1e-05 } parameters { name: "___batch_norm_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr new file mode 100644 index 0000000000000000000000000000000000000000..4f3002b19942ed58970bfd64e5978c1601273992 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr @@ -0,0 +1,39 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 1024 + active_type: "" +} +layers { + name: "__factorization_machine_0__" + type: "factorization_machine" + size: 1 + active_type: "" + inputs { + input_layer_name: "data" + 
input_parameter_name: "___factorization_machine_0__.w0" + } + factor_size: 10 +} +parameters { + name: "___factorization_machine_0__.w0" + size: 10240 + initial_mean: 0.0 + initial_std: 0.03125 + dims: 1024 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data" +output_layer_names: "__factorization_machine_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__factorization_machine_0__" + input_layer_names: "data" + output_layer_names: "__factorization_machine_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr index 94ad56cab063df9e6a11bb1c293727fb9dec810f..63fb38c6508675d379f577b965ea17ad4c3b4942 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr @@ -4,6 +4,8 @@ layers { type: "data" size: 300 active_type: "" + height: 10 + width: 10 } layers { name: "__prelu_layer_0__" @@ -15,6 +17,9 @@ layers { input_parameter_name: "___prelu_layer_0__.w0" } partial_sum: 1 + height: 10 + width: 10 + depth: 1 } layers { name: "__prelu_layer_1__" @@ -26,6 +31,9 @@ layers { input_parameter_name: "___prelu_layer_1__.w0" } partial_sum: 1 + height: 10 + width: 10 + depth: 1 } layers { name: "__prelu_layer_2__" @@ -37,41 +45,100 @@ layers { input_parameter_name: "___prelu_layer_2__.w0" } partial_sum: 5 + height: 10 + width: 10 + depth: 1 +} +layers { + name: "__prelu_layer_3__" + type: "prelu" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "___prelu_layer_3__.w0" + } + partial_sum: 300 + height: 10 + width: 10 + depth: 1 +} +layers { + name: "__prelu_layer_4__" + type: "prelu" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "___prelu_layer_4__.w0" + } + partial_sum: 100 + height: 10 + width: 10 + depth: 1 } parameters { name: "___prelu_layer_0__.w0" size: 300 - initial_mean: 0.0 - initial_std: 0.057735026919 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 300 initial_strategy: 0 - initial_smart: true + initial_smart: false } parameters { name: "___prelu_layer_1__.w0" size: 300 - initial_mean: 0.0 - initial_std: 0.057735026919 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 300 initial_strategy: 0 - initial_smart: true + initial_smart: false } parameters { name: "___prelu_layer_2__.w0" size: 60 - initial_mean: 0.0 - initial_std: 0.129099444874 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 60 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___prelu_layer_3__.w0" + size: 1 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___prelu_layer_4__.w0" + size: 3 + initial_mean: 0.25 + initial_std: 0.0 + dims: 1 + dims: 3 initial_strategy: 0 - initial_smart: true + initial_smart: false } input_layer_names: "input" -output_layer_names: "__prelu_layer_2__" +output_layer_names: "__prelu_layer_4__" sub_models { name: "root" layer_names: "input" layer_names: "__prelu_layer_0__" layer_names: "__prelu_layer_1__" layer_names: "__prelu_layer_2__" + layer_names: "__prelu_layer_3__" + layer_names: "__prelu_layer_4__" input_layer_names: "input" - output_layer_names: "__prelu_layer_2__" + output_layer_names: "__prelu_layer_4__" 
is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py index 7c848ef3fcd63314bfe91db6ebac406ba8758998..bf90d1762c89e96cfdd60ee9729cf33d2c7625c1 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py index c19bb9685aa24c4d66e4f0bbbcb004507413dbe8..7cfab838552ab01c55bd8a9cc8eb98f79e81cc68 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py index 565e281a6e1deff18aa48f97eb2f0e39ca79752f..8a425c7062ef5e09c1da7d9d1e26481276eafd95 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py index a5b5bb30b1d21aaa0c90868af7b5138e8a81aab1..8ee213a493596c24b7264eb8e2e34f87c2502006 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-4) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py index a991b22252ba10eed895efd931108c2d8b0e52f1..cbd3c3e97f8c31fc0d0eb6778fbbc4a1f09b7358 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-4) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py index cd7f609638e384314177d653e46ecf7a4b41a12f..bed9154fe3ed634bb65e994145c1b06c4ee88aa4 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-4) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py index be83f4f83c5d05ea2ffd9e3df0c09fb1a37a3e57..7e1da753f535e9c7c908f24250a4bfa932b3991d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py index f066fe1fb30877bf40bb6299d35546f7427989a5..0a719b073540dc3f79b827d1f9fa1a20fc23c39a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * data = data_layer(name='input', size=300) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py index 9b791a0222dd60e9ae2fca8b2798cddd13ed1d1c..7003872700bca22ccf9653172e3bcc6411f90425 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
# diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py index aa0a2c0d5fe19b6c414acd708bb6e82d9fb6568f..fb2cacd4433f085241bfdc00c27c8c41f992daa9 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index 7ce375c708af7b0b7ae1d700dedbdb6a4ce16c7f..a8b5c860ef5abf2c12f8d93c52fba24a9df818e7 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py index caa6aaa9430ffaee7ade93ee04ec90103bf8cf43..eba2e1e4834813be6e5771af0c61c0b54889eb45 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py index 8314a7e9a5586647c70ff010156817110919c72b..870388faf74bed7e749b944a5bc44304cc098f9e 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py index 4a5bdf1181dc4538418a8b89b41a1ff713e423c8..253244dcd42a04b3212f4400ddd1b5320ce3a229 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python #coding=utf-8 diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py index a113279fc17b49ad01b8860b61180af0f35694fb..db950093b37a0ec0fa6f4e4fd0dcb5d8270c220b 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py index 3572a2cb07d95ffaec261bdc63492ade734ea8b9..d304a2985916b9a9508706b712b86258b1236a77 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py index e52d48dde0084aacd3f7874cc384d59287a0c7d5..2e5dde2da231908462bad73700c6ac5b8bfc233e 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * vec1 = data_layer(name='vector1', size=10) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py index c53f10e0a410b27d86b2415d98178c4790e0b0ba..345fb2b6aba24b70fcab48a1a9fe466b28cbbfb5 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py new file mode 100644 index 0000000000000000000000000000000000000000..3a489a39da1f6408dbbef02546371f12e4340b62 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from paddle.trainer_config_helpers import * + +data = data_layer(name='data', size=1024) + +fm = factorization_machine(input=data, factor_size=10) + +outputs(fm) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py index 2842d3429c9c917845f8f4c33d3618608d40291d..90b0e37270f4f00e19dab35d5baa36a9902c6019 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py index 9dab45519c65b0ca686558ec7fe2064bb9ad8824..2bd4ab2da4e98726a0f35186c3582d4623ca4779 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
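Connecting the new test_factorization_machine.py config above with its protostr earlier in the patch: the latent factor matrix is the layer's only parameter, and the recorded numbers follow directly from the config (a quick arithmetic check, assuming the 'smart' default initialization of 1/sqrt(fan-in) that the protostr values imply):

.. code-block:: python

    input_size, factor_size = 1024, 10           # from data_layer and factorization_machine
    assert input_size * factor_size == 10240     # protostr: parameters.size
    assert input_size ** -0.5 == 0.03125         # protostr: initial_std = 1/sqrt(1024)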
from paddle.trainer_config_helpers import * data = data_layer(name='input', size=256) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py index 474e4f36bad7eab13251afe265d1a7d107549efd..451909ee183ec4760613b168420433bfc919ebb8 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-4) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py index dff1c535b3e84e14d0e7c343efe911f19872280a..3ebe40aadc7fb89016ee632d3ff19dc9a4b52fce 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py index 171da10f75dae03eed7e110d0efd07d6a18e1ecf..c762467febc5c0fc29437ca07d3793d971f12045 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
#!/usr/bin/env python #coding=utf-8 from paddle.trainer_config_helpers import * diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py index b36a5c6d1222860ee4b77f89ad4b6148ccd89589..58bf3de104641cb69b31fd64621f72a5df663408 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * outputs( diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py index 7ca1cc2db365dedda5d9673cafaa851a464a7b6b..8d570706dfbf672933b1f6ab78b95f19bc1033f5 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py index eb14270baa0c4ca0b84d2121a80fde0b45eda54a..3b6117d297a2942fda822d5eb71e54a3fc989d06 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py index c3376c47bded5a3aad15331936a61e12ac883b17..083d0643678bbc53429b9722f33427e5e21df2bb 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py index d250001932547d63a70de05940957f90cc014dfb..9c1445584196985759bace73a9b9ba7b91f1870a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py index b7a15666f0a5b863cbafec5f73dcfe0b9db2e0c7..046698fb4e2a17c738904dd90900d077285ec8e8 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py index 491e8c8caab38eb7c24e5461107ab5a9d63b12ef..1046db2f098b7eda5f428ad92d740338e791f780 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py index 0dbb921d41986e711d5b8b31caab1f8b6bdc47b8..37805d43767ba9fb9f7a2e9790332275b0475b1a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=100, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py index aae90fab32db78a70c2169ed8fafb930433f4136..10d759f6d90a70aad50561506f9bf49ae9737441 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py @@ -1,8 +1,23 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
from paddle.trainer_config_helpers import * -data = data_layer(name='input', size=300) -prelu = prelu_layer(input=data) -prelu = prelu_layer(input=data, partial_sum=1) -prelu = prelu_layer(input=data, partial_sum=5) +data = data_layer(name='input', size=300, height=10, width=10) +prelu = prelu_layer(input=data, num_channels=3) +prelu = prelu_layer(input=data, partial_sum=1, num_channels=3) +prelu = prelu_layer(input=data, partial_sum=5, num_channels=3) +prelu = prelu_layer(input=data, channel_shared=True, num_channels=3) +prelu = prelu_layer(input=data, channel_shared=False, num_channels=3) outputs(prelu) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py index 8da26ff44b19d0c18efae201a3b39002555d2605..22e0ce3e5acf33291ce71c5ecd622ec378e35b85 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py index 1a693f8dff06dec6e71eeb488da9c807c35e4c9b..d1d97f1c5e1373c677871aa7faf3e29b8c67e68c 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py index 004e2a5dd4efa9feab7619643673b37fe28146c5..6818b91f969c3993cf260e7472942937fe3b9660 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
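The test_prelu_layer.py hunk above is the one behavioral change in this run of files: data_layer now carries height/width, and prelu_layer is exercised with num_channels, partial_sum, and channel_shared. A rough NumPy reading of the partial_sum sharing mode, inferred from the test combinations — the reshaping convention here is an assumption, not the layer's actual kernel:

    import numpy as np

    def prelu(x, alpha, partial_sum):
        # x: (channels, height*width) activations for one sample.
        # One slope in `alpha` is shared by `partial_sum` consecutive
        # elements of a channel; partial_sum=1 is the element-wise case.
        c, n = x.shape
        groups = n // partial_sum
        a = np.repeat(alpha.reshape(c, groups), partial_sum, axis=1)
        return np.where(x > 0, x, a * x)

    x = np.random.randn(3, 100)     # num_channels=3, 10x10 as in the test
    alpha = np.random.rand(3, 20)   # partial_sum=5: one slope per 5 elements
    y = prelu(x, alpha, partial_sum=5)

Under this reading, channel_shared=True would collapse alpha further to a single slope shared across all channels.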
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py index 09a6f507338c1da8e9ce60555f8ca2576704170c..ce8a22ebb170428e97905e941f443b43cff9ee42 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * data = data_layer(name='input', size=300) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py index 91010759e4847f087eb4e05ad98ae794a2129365..79dad5e2508395555bc497c66c7632906fc5cbe0 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py index b739a81b8505c94a2312ac735647fb114982f1f7..264341f899e161a4fea0c4ca2ae736f4154a703f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py index ab33c496b0663d8472ce4b272be6c5cecbcfc978..342a5029a834325d3851c2f127d3f520e2a91a58 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py index ac8badb26a40e96e75225e6f61aa536cd28e9098..9521fa6c471a1e6f396c36553cdcd4602b4c5901 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * data = data_layer(name='input', size=300) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py index dd589116fa9932144ca066d3fa4c929d1433a7f1..698d19d037580ea00911030963b627e4bf560874 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * data = data_layer(name='data', size=100) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py index 8d4bf28bf1eaf58e1fd0eb62fd10efe998587edd..22fb25d0f23d8bf401682812b58a1a1d9165d272 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py index 5c161ba805fb301e8feb8702ad61a8341df40e3f..1883ed9d4ed428b5a986596cc8e2cda19a400375 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py index 510ad3220893fddac278ba691307d00d57e440a3..12d7f1f33b1ce58abcaa415ce8ac827671a023e1 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python #coding=utf-8 from paddle.trainer_config_helpers import * diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py index 3c205eabd80492a68383fdbecd14a7d6db3e16eb..8cf5fd70e3fd09a66f8b5d78d09c259112a9dec3 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py index 66629662dd9166766daaf707409b720f56ef1405..7188d82a534e6d37d2d4be7af35433fab19e260b 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * data = data_layer(name='input', size=300) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py index 318b4459bab7a70ddec534c4ad217161ffc72d5a..a628272196a018f756f3e467fd8fb5db2aa56284 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * define_py_data_sources2( diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py index e0b0d0d3be252700d99f7097f0353df885efcf07..58c1675e6b6ea6288376a44d7ce9541d788187e0 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=100, learning_rate=1e-5) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py index 6d1c3175ba9801d69f3f9cb9e754858253192270..64d1d7b6eebe9cf18df82c2ca800794181b701b4 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/env python #coding=utf-8 from paddle.trainer_config_helpers import * diff --git a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py index ebb39219bdc1fa314e1d70bcda902f71296772f6..6294cb04ef5165eb4fc79dd241b03dda1453cc98 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-4) diff --git a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py index 27f1c8e9938cdec12fccb37a3127bba1f8ee8d04..89b881b3611d97f55e3299f761d1dcda946ba076 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.trainer_config_helpers import * settings(learning_rate=1e-4, batch_size=1000) diff --git a/python/paddle/utils/dump_v2_config.py b/python/paddle/utils/dump_v2_config.py new file mode 100644 index 0000000000000000000000000000000000000000..5dc2111e379fd39b40e1e9bcf2e577b57b101a68 --- /dev/null +++ b/python/paddle/utils/dump_v2_config.py @@ -0,0 +1,62 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections + +from paddle.trainer_config_helpers.layers import LayerOutput +from paddle.v2.layer import parse_network +from paddle.proto import TrainerConfig_pb2 + +__all__ = ["dump_v2_config"] + + +def dump_v2_config(topology, save_path, binary=False): + """ Dump the network topology to a specified file. + + This function is only used to dump network defined by using PaddlePaddle V2 + APIs. This function will NOT dump configurations related to PaddlePaddle + optimizer. + + :param topology: The output layers (can be more than one layers given in a + Python List or Tuple) of the entire network. Using the + specified layers (if more than one layer is given) as root, + traversing back to the data layer(s), all the layers + connected to the specified output layers will be dumped. + Layers not connceted to the specified will not be dumped. + :type topology: LayerOutput|List|Tuple + :param save_path: The path to save the dumped network topology. 
+ :type save_path: str + :param binary: Whether to dump the serialized network topology or not. + The default value is false. NOTE that, if you call this + function to generate network topology for PaddlePaddle C-API, + a serialized version of network topology is required. When + using PaddlePaddle C-API, this flag MUST be set to True. + :type binary: bool + """ + + if isinstance(topology, LayerOutput): + topology = [topology] + elif isinstance(topology, collections.Sequence): + for out_layer in topology: + assert isinstance(out_layer, LayerOutput), ( + "The type of each element in the parameter topology " + "should be LayerOutput.") + else: + raise RuntimeError("Error input type for parameter topology.") + + model_str = parse_network(topology) + with open(save_path, "w") as fout: + if binary: + fout.write(model_str.SerializeToString()) + else: + fout.write(str(model_str)) diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py index e8db525ff5c388aef1a39d8db56633d509cb4fb9..1acf40df58e7bf5db1a76376037fd06372c9fa8b 100644 --- a/python/paddle/utils/image_multiproc.py +++ b/python/paddle/utils/image_multiproc.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import os, sys import numpy as np from PIL import Image diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py index 421e953d2775f145800cf7179ec644697a265060..2b100207728a8532e900992f7db4d3910e893dea 100644 --- a/python/paddle/utils/merge_model.py +++ b/python/paddle/utils/merge_model.py @@ -30,7 +30,8 @@ def merge_v2_model(net, param_file, output_file): which ends with .tar.gz. @param net The output layer of the network for inference. - @param param_file Path of the parameters (.tar.gz) which is stored by v2 api. + @param param_file Path of the parameters (.tar.gz) which is stored by + v2 api. @param output_file Path of the merged file which will be generated. Usage: diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py index 27bd8157d39632913e2fa3278f3af20ddea61da7..27a69b6a5c8b42e6b4c829e3ceebe11bed63ad15 100644 --- a/python/paddle/utils/plotcurve.py +++ b/python/paddle/utils/plotcurve.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. #!/usr/bin/python # Copyright (c) 2016 PaddlePaddle Authors. 
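A usage sketch for the new dump_v2_config above; the tiny two-layer network is an assumption for illustration, and only the dump calls come from the file itself:

    import paddle.v2 as paddle
    from paddle.utils.dump_v2_config import dump_v2_config

    paddle.init(use_gpu=False, trainer_count=1)
    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
    y = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())

    dump_v2_config(y, "topology.txt")               # readable protobuf text
    dump_v2_config(y, "topology.bin", binary=True)  # required for the C-API
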
All Rights Reserved # diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 7bbe3eaaa67a117bc53571e6571365c3a26814c1..df710c33d0c0ca16d358dac1eb42327e9cd4c7ae 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -62,6 +62,68 @@ __all__ = [ cp.begin_parse() +def set_env_vars(trainer_count): + '''Auto set CPU environment if have not set before. + For MKL: + export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status. + export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count. + For OpenBLAS: + export OPENBLAS_NUM_THREADS, OPENBLAS_MAIN_FREE according to trainer_count. + ''' + import platform, paddle + if not platform.system() in ['Linux', 'Darwin']: + return + + def set_env(key, value): + '''If the key has not been set in the environment, set it with value.''' + assert isinstance(key, str) + assert isinstance(value, str) + envset = os.environ.get(key) + if envset is None: + os.environ[key] = value + + def num_physical_cores(): + '''Get the number of physical cores''' + if platform.system() == "Linux": + num_sockets = int( + os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l") + .read()) + num_cores_per_socket = int( + os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l") + .read()) + return num_sockets * num_cores_per_socket + else: + cmds = {"Darwin": "sysctl -n hw.physicalcpu"} + return int(os.popen(cmds.get(platform.system(), "expr 1")).read()) + + def num_logical_processors(): + '''Get the number of logical processors''' + cmds = { + "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l", + "Darwin": "sysctl -n hw.logicalcpu" + } + return int(os.popen(cmds.get(platform.system(), "expr 1")).read()) + + num_cores = num_physical_cores() + num_processors = num_logical_processors() + if paddle.version.mkl() == 'ON': + if num_processors > num_cores: # Hyper Threading is enabled + set_env("OMP_DYNAMIC", "true") + set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") + else: + set_env("OMP_DYNAMIC", "false") + set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") + threads = num_processors / trainer_count + threads = '1' if threads < 1 else str(threads) + if paddle.version.mkl() == 'ON': + set_env("OMP_NUM_THREADS", threads) + set_env("MKL_NUM_THREADS", threads) + else: + set_env("OPENBLAS_NUM_THREADS", threads) + if threads > 1: + set_env("OPENBLAS_MAIN_FREE", '1') + + def init(**kwargs): import py_paddle.swig_paddle as api args = [] @@ -76,35 +138,14 @@ def init(**kwargs): for key in args_dict.keys(): args.append('--%s=%s' % (key, str(args_dict[key]))) - # auto set cpu environment - def set_env(key, value): - '''If the key has not been set in the environment, set it with value.''' - assert isinstance(key, str) - assert isinstance(value, str) - envset = os.environ.get(key) - if envset is None: - os.environ[key] = value - - ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs") - ht = int(ht.read()) - if ht == 1: # ht is off - set_env("OMP_DYNAMIC", "false") - set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") - else: - set_env("OMP_DYNAMIC", "true") - set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") - processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l") - processors = int(processors.read()) - trainers = kwargs.get('trainer_count', 1) - threads = processors / trainers - threads = '1' if threads < 1 else str(threads) - set_env("OMP_NUM_THREADS", threads) - set_env("MKL_NUM_THREADS", threads) + set_env_vars(kwargs.get('trainer_count', 1)) if 'use_gpu' in 
kwargs: cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] if 'use_mkldnn' in kwargs: cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn'] + if 'use_mkl_packed' in kwargs: + cp.g_command_config_args['use_mkl_packed'] = kwargs['use_mkl_packed'] assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not " "supported in v2 APIs.") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index e31e501ce93c5dc20693a8724ee7dd864f9aef55..fab8a68b0beee8b813bee2a05047e2da526a9c9b 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -23,7 +23,6 @@ import paddle.v2.dataset import cPickle import glob import cPickle as pickle -import random __all__ = [ 'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader', @@ -71,7 +70,7 @@ def download(url, module_name, md5sum): if retry < retry_limit: retry += 1 else: - raise RuntimeError("Cannot download {0} within retry limit {2}". + raise RuntimeError("Cannot download {0} within retry limit {1}". format(url, retry_limit)) print "Cache file %s not found, downloading %s" % (filename, url) r = requests.get(url, stream=True) @@ -206,7 +205,6 @@ def convert(output_path, reader, line_count, name_prefix): indx_f = 0 def write_data(indx_f, lines): - random.shuffle(lines) filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) writer = recordio.writer(filename) for l in lines: diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 634388094c804827657dc83d5c205e680625b156..7bdddeaabec733ef26b3f766c6437f5c53d65044 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid'] DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' -DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' # In official 'readme', tstid is the flag of test data diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index cfc1c886e1389c15e3f803c341b6f62dd7b4bf41..37c4296f9bcea7e16daa46f778934331513c30c4 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -23,10 +23,8 @@ Besides, this module also provides API for building dictionary. import paddle.v2.dataset.common import collections import tarfile -import Queue import re import string -import threading __all__ = ['build_dict', 'train', 'test', 'convert'] @@ -74,47 +72,20 @@ def build_dict(pattern, cutoff): return word_idx -def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): +def reader_creator(pos_pattern, neg_pattern, word_idx): UNK = word_idx[''] + INS = [] - qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)] - - def load(pattern, queue): + def load(pattern, out, label): for doc in tokenize(pattern): - queue.put(doc) - queue.put(None) + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) def reader(): - # Creates two threads that loads positive and negative samples - # into qs. 
- t0 = threading.Thread( - target=load, args=( - pos_pattern, - qs[0], )) - t0.daemon = True - t0.start() - - t1 = threading.Thread( - target=load, args=( - neg_pattern, - qs[1], )) - t1.daemon = True - t1.start() - - # Read alternatively from qs[0] and qs[1]. - i = 0 - doc = qs[i].get() - while doc != None: - yield [word_idx.get(w, UNK) for w in doc], i % 2 - i += 1 - doc = qs[i % 2].get() - - # If any queue is empty, reads from the other queue. - i += 1 - doc = qs[i % 2].get() - while doc != None: - yield [word_idx.get(w, UNK) for w in doc], i % 2 - doc = qs[i % 2].get() + for doc, label in INS: + yield doc, label return reader @@ -133,7 +104,7 @@ def train(word_idx): """ return reader_creator( re.compile("aclImdb/train/pos/.*\.txt$"), - re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000) + re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) def test(word_idx): @@ -150,7 +121,7 @@ def test(word_idx): """ return reader_creator( re.compile("aclImdb/test/pos/.*\.txt$"), - re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) + re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) def word_dict(): diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index b705c9109b2b6769c9fafa9241db5d81c682f9e3..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -24,7 +24,6 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20 """ import os -import random import functools import rarfile from common import download @@ -265,7 +264,7 @@ def query_filter(querylists): return filter_query -def load_from_text(filepath, shuffle=True, fill_missing=-1): +def load_from_text(filepath, shuffle=False, fill_missing=-1): """ parse data file into querys """ @@ -287,17 +286,14 @@ def load_from_text(filepath, shuffle=True, fill_missing=-1): querylist._add_query(query) if querylist is not None: querylists.append(querylist) - if shuffle == True: - random.shuffle(querylists) return querylists -def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1): +def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): """ Parameters -------- filename : string - shuffle : shuffle query-doc pair under the same query fill_missing : fill the missing value. default in MQ2007 is -1 Returns diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index b0b9757c1a75d215cf8945b5cedbb1239fd43af7..7174413018cc29216a99c9291dee5ef723ea95f1 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
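The imdb.py rewrite above drops the two background threads and bounded queues in favor of loading every tokenized document into a list up front. A self-contained rendering of that pattern — the token data and the unknown-word key are placeholders, not the module's real dictionary:

    def reader_creator(pos_docs, neg_docs, word_idx, unk_key='<unk>'):
        unk = word_idx[unk_key]
        ins = []
        # Positive documents get label 0, negative ones label 1,
        # matching the order of the patched load() calls.
        for label, docs in ((0, pos_docs), (1, neg_docs)):
            for doc in docs:
                ins.append(([word_idx.get(w, unk) for w in doc], label))

        def reader():
            for doc, label in ins:
                yield doc, label

        return reader

    word_idx = {'<unk>': 0, 'good': 1, 'bad': 2}
    r = reader_creator([['good']], [['bad', 'bad']], word_idx)
    print(list(r()))   # [([1], 0), ([2, 2], 1)]

The trade-off is memory for determinism: the reader now yields a stable order, consistent with the shuffle defaults flipping to False in the mq2007.py hunk of this same patch.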
# /usr/bin/env python # -*- coding:utf-8 -*- diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py index 4e52810e6b924e0796e3d836dbbcb27ede2c9e25..9b3ab72feb0d30cbd5c86d3add31ee46ccd4d653 100644 --- a/python/paddle/v2/dataset/tests/imikolov_test.py +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.dataset.imikolov import unittest diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py index 407405290734609059c1767600748d530e8a13a6..f107948801d09f2e4490c8187217322bd57d5edb 100644 --- a/python/paddle/v2/dataset/tests/test_sentiment.py +++ b/python/paddle/v2/dataset/tests/test_sentiment.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # /usr/bin/env python # -*- coding:utf-8 -*- diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 98b97c75ca72f11c105535e0f2a5fa0201db5d42..f10bf7e42a1ead09b3eba0d61e55701215e4360f 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -38,6 +38,7 @@ UCI_TEST_DATA = None URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + def feature_range(maximums, minimums): import matplotlib matplotlib.use('Agg') @@ -114,7 +115,8 @@ def test(): def model(): - tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL) + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', + MD5_MODEL) with open(tar_file, 'r') as f: parameters = Parameters.from_tar(f) return parameters diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index a0ffd31c545eb10dd8c2f14746ee90df58700e61..f322bffe133e9a726668495826eb145062f52ac7 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. """ Testing and training events. diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 5df612bf3530c843c16b337f2b8f83445fcf39b5..8c29ee741cbb8f484531c450dd99cf183f78178e 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -1,11 +1,96 @@ -import sys -import core -__all__ = ['proto'] -argv = [] -if core.is_compile_gpu(): - argv = list(sys.argv) + [ - "--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory" - ] -else: - argv = list(sys.argv) + ["--tryfromenv=use_pinned_memory"] -core.init_gflags(argv) +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function +# import all class inside framework into fluid module +import framework +from framework import * +# import all class inside executor into fluid module +import executor +from executor import * + +import io +import evaluator +import initializer +import layers +import nets +import optimizer +import backward +import regularizer +from param_attr import ParamAttr +from data_feeder import DataFeeder +from core import LoDTensor, CPUPlace, CUDAPlace +from distribute_transpiler import DistributeTranspiler +from distribute_transpiler_simple import SimpleDistributeTranspiler +import clip +from memory_optimization_transpiler import memory_optimize + +Tensor = LoDTensor +__all__ = framework.__all__ + executor.__all__ + [ + 'io', + 'initializer', + 'layers', + 'nets', + 'optimizer', + 'backward', + 'regularizer', + 'LoDTensor', + 'CPUPlace', + 'CUDAPlace', + 'Tensor', + 'ParamAttr' + 'DataFeeder', + 'clip', + 'SimpleDistributeTranspiler', + 'DistributeTranspiler', + 'memory_optimize', +] + + +def __bootstrap__(): + """ + Enable reading gflags from environment variables. + + Returns: + None + """ + import sys + import core + import os + + try: + num_threads = int(os.getenv('OMP_NUM_THREADS', '1')) + except ValueError: + num_threads = 1 + + if num_threads > 1: + print( + 'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation ' + 'speed will not be optimized if you use data parallel. 
It will ' + 'fail if this PaddlePaddle binary is compiled with OpenBlas since' + ' OpenBlas does not support multi-threads.'.format(num_threads), + file=sys.stderr) + print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr) + + os.environ['OMP_NUM_THREADS'] = str(num_threads) + + read_env_flags = ['use_pinned_memory', 'check_nan_inf'] + if core.is_compile_gpu(): + read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync'] + core.init_gflags([sys.argv[0]] + + ["--tryfromenv=" + ",".join(read_env_flags)]) + core.init_glog(sys.argv[0]) + core.init_devices() + + +__bootstrap__() diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index f188582178f667125ec95cd230100fdb10ce7e88..27cf637c48bbd5fb8578d1526530b484f3a2c523 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -1,57 +1,558 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.v2.fluid import framework as framework +from . import core +import collections +import copy -__all__ = ['append_backward_ops'] +__all__ = [ + 'append_backward', + 'calc_gradient', +] -def append_backward_ops(loss, parameter_list=None, no_grad_set=None): +def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): """ - Create and add gradient Operators in BlockDesc to compute - gradients of `loss` for parameters in parameter_list + Traverse all ops in op_descs[begin_idx : end_idx], + if any op has inputs/outputs named "old_name", rename it as 'new_name' + """ + if begin_idx is None: + begin_idx = 0 + if end_idx is None: + end_idx = len(op_descs) + for i in range(begin_idx, end_idx): + op_desc = op_descs[i] + if isinstance(op_desc, tuple): + op_desc = op_desc[0] + op_desc.rename_input(old_name, new_name) + op_desc.rename_output(old_name, new_name) + + +def _create_op_desc_(op_type, inputs, outputs, attrs): + """ + Create a C++ OpDesc object with specified inputs, outputs and attributes. 
+ """ + op_desc = core.OpDesc() + op_desc.set_type(op_type) + for para, args in inputs.iteritems(): + op_desc.set_input(para, args) + for para, args in outputs.iteritems(): + op_desc.set_output(para, args) + for name, val in attrs.iteritems(): + if isinstance(val, framework.Block): + op_desc.set_block_attr(name, val.desc) + else: + op_desc.set_attr(name, val) + return op_desc + + +def _infer_var_data_type_(grad_var_name, block): + """ + Infer the data type of given grad variable + """ + grad_var = block.desc.find_var(grad_var_name.encode("ascii")) + fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii")) + if block.desc.has_var_recursive(fwd_name): + fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii")) + grad_var.set_dtype(fwd_var.dtype()) + else: + grad_var.set_dtype(core.DataType.FP32) + + +def _all_in_set_(cands, s): + """ + Test if all elements of 'cands' are in set 's' + """ + if len(cands) == 0: + return False + for c in cands: + if not c in s: + return False + return True + + +def _some_in_set_(cands, s): + """ + Test if some elements of 'cands' are in set 's' + """ + if len(cands) == 0: + return False + for c in cands: + if c in s: + return True + return False + + +def _strip_grad_suffix_(name): + """ + Strip the grad suffix from the given varibale name + e.g. x@GRAD ==> x + y@GRAD@RENAME@1 ==> y + """ + pos = name.find(core.grad_var_suffix()) + return name[:pos] if pos != -1 else name + + +def _append_grad_suffix_(name): + """ + Append grad suffix to the given variable name + e.g. x ==> x@GRAD + """ + return name + core.grad_var_suffix() + + +def _addup_repetitive_outputs_(op_descs): + """ + In backward part, an variable may be the output of more than one ops. + In this case, the variable should be the accumulation of all the outputs. + `sum_op`s are added to implement the accumulate. + """ + pending_sum_ops = [] + var_rename_count = collections.defaultdict(int) + renamed_vars = collections.defaultdict(list) + for idx, op_desc in enumerate(op_descs): + for var_name in op_desc.input_arg_names(): + if len(renamed_vars[var_name]) > 1: + pending_sum_ops.append( + (_create_op_desc_("sum", {"X": renamed_vars[var_name]}, + {"Out": [var_name]}, {}), idx)) + renamed_vars[var_name] = [var_name] + for var_name in op_desc.output_arg_names(): + if var_name == core.empty_var_name( + ) or var_name in op_desc.input_arg_names(): + # empty variable or inplace op + continue + if len(renamed_vars[var_name]) == 0: + # it's the first time we get the variable + renamed_vars[var_name] = [var_name] + else: + if len(renamed_vars[var_name]) == 1: + new_name = var_name + "@RENAME@" + \ + str(var_rename_count[var_name]) + var_rename_count[var_name] += 1 + # rename original var_name + renamed_vars[var_name][0] = new_name + _rename_arg_(op_descs, var_name, new_name, 0, idx) + _rename_arg_(pending_sum_ops, var_name, new_name) + + new_name = var_name + "@RENAME@" + \ + str(var_rename_count[var_name]) + var_rename_count[var_name] += 1 + op_desc.rename_output(var_name, new_name) + renamed_vars[var_name].append(new_name) + for var_name, inputs in renamed_vars.iteritems(): + if len(inputs) > 1: + pending_sum_ops.append((_create_op_desc_( + "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs))) + # sum_op descs are sorted according to their insert position + for p in reversed(pending_sum_ops): + op_descs.insert(p[1], p[0]) + + return op_descs + + +def _remove_no_grad_branch_(op_descs, no_grad_set): + """ + Remove unnecessary grad ops + A grad op can be removed in two cases: + 1. 
all outputs of the grad op are in 'no_grad_set' + 2. all grad inputs of the grad op are in 'no_grad_set' + """ + + def _op_can_be_removed_(op_desc, no_grad_set): + out_arg_names = op_desc.output_arg_names() + if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set): + return True + if _all_in_set_( + filter(lambda name: name.find(core.grad_var_suffix()) != -1, + op_desc.input_arg_names()), no_grad_set): + no_grad_set.union(out_arg_names) + return True + return False + + # Remove ops whose outputs are all in no_grad_dict + op_descs = filter( + lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs) + # Insert fill_zeros_like_op + to_insert = [] + for idx, op_desc in enumerate(op_descs): + for arg in op_desc.input_arg_names(): + if core.grad_var_suffix() in arg and arg in no_grad_set: + to_insert.append((_create_op_desc_("fill_zeros_like", { + "X": [_strip_grad_suffix_(arg)] + }, {"Out": [arg]}, {}), idx)) + + map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) + + return op_descs + + +def _append_backward_ops_(block, + ops, + target_block, + no_grad_dict, + grad_to_var, + callback=None): + """ + Create all grad ops, and insert them into given block + + Args: + block(Block): the block where forward ops are + ops(Op): the forward operators whose backward ops need to be added + target_block(Block): the block which is going to hold new generated grad ops + no_grad_dict(dict): + key(int) block index + val(set) a set of varibale names. These varibales have no gradient + grad_to_var(dict)(output argument): + key(str): grad variable name + val(str): corresponding forward variable name + callback(callable object): a callable object used to decorate new generated grad ops + """ + if callback is None: + + def empty_callback(block, context): + pass + + callback = empty_callback + elif not hasattr(callback, '__call__'): + raise ValueError("'callback' must be a callable object.") + + # grad_op_descs holds created grad_op, and will be appended to target_block + grad_op_descs = [] + program = block.program + for op in reversed(ops): + grad_sub_block_list = [] + # If the op has its own sub-block, deal with the sub-block first + if op.has_attr("sub_block"): + sub_block = program.block(op.block_attr("sub_block")) + grad_sub_block = program.create_block(parent_idx=sub_block.idx) + _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, + no_grad_dict, grad_to_var) + grad_sub_block_list.append(grad_sub_block.desc) + + # Getting op's corresponding grad_op + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + op.desc, no_grad_dict[block.idx], grad_sub_block_list) + + grad_op_descs.extend(grad_op_desc) + grad_to_var.update(op_grad_to_var) + + grad_op_descs = _addup_repetitive_outputs_(grad_op_descs) + + grad_op_descs = _remove_no_grad_branch_(grad_op_descs, + no_grad_dict[block.idx]) + + # append op_desc in grad_op_descs to target_block + for op_desc in grad_op_descs: + new_op_desc = target_block.desc.append_op() + new_op_desc.copy_from(op_desc) + callback(block=target_block, context=grad_to_var) + + +def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): + """ + Create new variables required by backward pass. - :param loss: an variable generated by cost function. - :type loss: Variable - :param no_grad_set: variable that should not create gradient - :type no_grad_set: set - :param parameter_list: parameters that need to compute gradient and - update to optimize the lost. - :type: list - :return: list of (parameters, gradients) pair. 
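The _strip_grad_suffix_/_append_grad_suffix_ helpers earlier in this file are small enough to pin down with plain strings. Assuming core.grad_var_suffix() is "@GRAD" — the convention their own docstrings show — they behave like:

    GRAD_SUFFIX = "@GRAD"  # assumed value of core.grad_var_suffix()

    def strip_grad_suffix(name):
        # Everything from the first "@GRAD" on is dropped, so renamed
        # duplicates such as "y@GRAD@RENAME@1" still map back to "y".
        pos = name.find(GRAD_SUFFIX)
        return name[:pos] if pos != -1 else name

    def append_grad_suffix(name):
        return name + GRAD_SUFFIX

    assert strip_grad_suffix("x@GRAD") == "x"
    assert strip_grad_suffix("y@GRAD@RENAME@1") == "y"
    assert append_grad_suffix("x") == "x@GRAD"

This is also why _addup_repetitive_outputs_ can rename a repeated gradient output to "<name>@RENAME@<n>" and still recover the forward variable it belongs to.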
- :rtype: list[Variable] + Args: + block(Block): the block where new variables will be created + start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created + grad_to_var(dict): + key(str): grad variable name + val(str): corresponding forward variable name + In most cases, this dict is generated by _append_backward_ops_() + grad_info_map(dict)(output argument): + key(str): forward variable name + val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable + """ + for op_idx in range(start_op_idx, block.desc.op_size()): + op_desc = block.desc.op(op_idx) + if op_desc.has_attr("sub_block"): + sub_block = block.program.block(op_desc.block_attr("sub_block")) + _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) + new_vars = set() + # create new gradient variables + for grad_var_name in op_desc.output_arg_names(): + grad_var_name = grad_var_name.encode("ascii") + if block.desc.has_var_recursive( + grad_var_name) or grad_var_name == core.empty_var_name(): + continue + block.desc.var(grad_var_name) + new_vars.add(grad_var_name) + if not grad_to_var.has_key(grad_var_name): + continue + grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block) + # infer_shape and infer_type + op_desc.infer_var_type(block.desc) + op_desc.infer_shape(block.desc) + for arg in op_desc.output_arg_names(): + if arg in new_vars: + _infer_var_data_type_(arg, block) + + +def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): + var_map = copy.copy(target_grad_map) + for op_idx in range(start_op_idx, block.desc.op_size()): + op_desc = block.desc.op(op_idx) + for name in op_desc.input_arg_names(): + if name in var_map: + op_desc.rename_input(name, var_map[name]) + + for name in op_desc.output_arg_names(): + if block.desc.find_var(name.encode("ascii")): + new_name = "%s_%s" % (name, core.unique_integer(name)) + op_desc.rename_output(name, new_name) + var_map[name] = new_name + + for g, ng in var_map.iteritems(): + if g in grad_to_var: + grad_to_var[ng] = grad_to_var[g] + grad_to_var.pop(g) + + +def _get_stop_gradients_(program): + no_grad_dict = dict() + assert isinstance(program, framework.Program) + for block in program.blocks: + assert isinstance(block, framework.Block) + block_no_grad_set = set() + for var in block.vars.itervalues(): + assert isinstance(var, framework.Variable) + if var.stop_gradient: + block_no_grad_set.add(_append_grad_suffix_(var.name)) + no_grad_dict[block.idx] = block_no_grad_set + return no_grad_dict + + +def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None): + """ + Append backward part to main_program + + Args: + loss(Variable): The variable generated by the cost function. + parameter_list(list[string]): Parameters that need to be updated by + the optimizer. If None, it means all parameters need to be updated. + no_grad_set(set): Variables that have no gradients in Block 0. + All variables with `stop_gradient=True` from all blocks will be + automatically added. + + Return: + (list[(Variable,Variable)]): list of (parameter, gradient) pairs.
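+ + A minimal usage sketch (illustrative; assumes `avg_cost` is a scalar loss Variable in the default main program): + + p_g_list = append_backward(loss=avg_cost) + # each element of p_g_list is a (parameter, gradient) pair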
""" assert isinstance(loss, framework.Variable) + program = loss.block.program if no_grad_set is None: - program = loss.block.program - assert isinstance(program, framework.Program) - no_grad_set = list() - for block in program.blocks: - assert isinstance(block, framework.Block) - for var in block.vars.itervalues(): - assert isinstance(var, framework.Variable) - if var.stop_gradient: - no_grad_set.append(var.name) - no_grad_set = set(no_grad_set) - - param_grad_map = loss.block.program.append_backward(loss, no_grad_set) + no_grad_set = set() + no_grad_set = copy.copy(no_grad_set) + no_grad_dict = _get_stop_gradients_(program) + no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + + grad_info_map = dict() + root_block = program.block(0) + + fwd_op_num = root_block.desc.op_size() + current_block_idx = program.current_block_idx + grad_to_var = dict() + + op_desc = _create_op_desc_("fill_constant", {}, { + "Out": [_append_grad_suffix_(loss.name)] + }, {"shape": [1], + "value": 1.0, + "dtype": loss.dtype}) + root_block.desc.append_op().copy_from(op_desc) + + block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) + op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) + no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + + _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, + grad_to_var, callback) + + # Because calc_gradient may be called multiple times, + # we need rename the internal gradient variables so that they have + # different names. + _rename_grad_(root_block, fwd_op_num, grad_to_var, {}) + + _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) + + program.current_block_idx = current_block_idx + program.sync_with_cpp() + if parameter_list is not None: parameters = parameter_list else: - params = loss.block.program.global_block().all_parameters() + params = program.global_block().all_parameters() parameters = [param.name for param in params] + params_and_grads = [] for param in parameters: - if param not in param_grad_map: + if param not in grad_info_map: raise ValueError("param %s is not in map" % param) - grad_info = param_grad_map[param] - grad_block = loss.block.program.block(grad_info[1]) + grad_info = grad_info_map[param] + grad_block = grad_info[1] if not grad_block.has_var(grad_info[0]): raise ValueError("grad block[{0}] did not have grad var {1}".format( grad_info[1], grad_info[0])) # Get the param var from the global block - param_var = loss.block.program.global_block().var(param) + param_var = program.global_block().var(param) grad_var = grad_block.var(grad_info[0]) if loss.block.has_var(grad_info[0]): params_and_grads.append((param_var, grad_var)) else: params_and_grads.append((param_var, None)) return params_and_grads + + +def _as_list(x): + if x is None: + return [] + return list(x) if isinstance(x, collections.Sequence) else [x] + + +def _find_op_path_(block, outputs, inputs, no_grad_set): + """ + no_grad_set will also be changed + """ + input_names = set([inp.name for inp in inputs]) + output_names = set([out.name for out in outputs]) + + relevant_op_flags = [True] * len(block.ops) + + # All the inputs of the block are used if inputs is empty, + if inputs: + for i, op in enumerate(block.ops): + if _some_in_set_(op.desc.input_arg_names(), input_names): + for name in op.desc.output_arg_names(): + if name not in no_grad_set: + input_names.add(name) + else: + relevant_op_flags[i] = False + + for i, op in reversed(list(enumerate(block.ops))): + if 
_some_in_set_(op.desc.output_arg_names(), output_names): + for name in op.desc.input_arg_names(): + if name not in no_grad_set: + output_names.add(name) + else: + relevant_op_flags[i] = False + + op_path = [ + block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i] + ] + + if inputs: + for op in op_path: + for name in op.desc.input_arg_names(): + if name not in input_names: + no_grad_set.add(name) + + return op_path + + +def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): + """ + Backpropagate the gradients of targets to inputs. + + Args: + targets(Variable|list[Variable]): The target variables + inputs(Variable|list[Variable]): The input variables + no_grad_set(set[string]): The names of variables that have no gradients + in Block 0. All variables with `stop_gradient=True` from all blocks + will be automatically added. + + Return: + (list[Variable]): list of gradients for inputs + If an input does not affect targets, the corresponding gradient variable + will be None + """ + targets = _as_list(targets) + inputs = _as_list(inputs) + target_gradients = _as_list(target_gradients) + + block = targets[0].block + prog = block.program + block_idx = block.idx + + if not target_gradients: + target_gradients = [None] * len(targets) + + if len(targets) != len(target_gradients): + raise ValueError( + "Should have the same number of target_gradients as targets") + + if no_grad_set is None: + no_grad_set = set() + no_grad_set = copy.copy(no_grad_set) + no_grad_dict = _get_stop_gradients_(prog) + no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + + fwd_op_num = block.desc.op_size() + + target_grad_map = {} + for i, grad in enumerate(target_gradients): + target = targets[i] + if grad is None: + grad_name = _append_grad_suffix_(target.name) + op_desc = _create_op_desc_("fill_constant_batch_size_like", + {"Input": [target.name]}, + {"Out": [grad_name]}, { + "shape": target.shape, + "value": 1.0, + "dtype": target.dtype, + 'input_dim_idx': 0, + 'output_dim_idx': 0 + }) + block.desc.append_op().copy_from(op_desc) + else: + if target.block.idx != block_idx or target.block.program != prog: + raise ValueError("all targets must be in the same block") + if target.shape != grad.shape: + raise ValueError( + "The shapes of target and grad are different: %s %s" % ( + target.name, grad.name)) + target_grad_map[_append_grad_suffix_(target.name)] = grad.name + + for input in inputs: + if input.block.program != prog: + raise ValueError( + "input must be in the same program as targets") + + block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) + op_path = _find_op_path_(block, targets, inputs, block_no_grad_set) + no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + grad_to_var = dict() + grad_info_map = dict() + _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var) + + # Because calc_gradient may be called multiple times, + # we need to rename the internal gradient variables so that they have + # different names.
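+ # For example (illustrative): on a repeated call, an internal gradient + # named "x@GRAD" may be renamed to something like "x@GRAD_1", with the + # integer suffix coming from core.unique_integer.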
+ _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map) + + _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) + prog.sync_with_cpp() + + grad_vars = [] + for input_var in inputs: + if input_var.name not in grad_info_map: + grad_vars.append(None) + else: + grad_info = grad_info_map[input_var.name] + grad_block = grad_info[1] + grad_var = grad_block.var(grad_info[0]) + grad_vars.append(grad_var) + + if len(grad_vars) == 1: + return grad_vars[0] + else: + return grad_vars diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e4d9ed599ee297b2f3cf85f3eed716b1c49578d3 --- /dev/null +++ b/python/paddle/v2/fluid/clip.py @@ -0,0 +1,121 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import functools +import layers +from . import core + +__all__ = [ + 'GradientClipByValue', + 'ErrorClipByValue', + 'append_gradient_clip_ops', + 'error_clip_callback', +] + + +class BaseErrorClipAttr(object): + def append_clip_op(self, block, grad_name): + raise NotImplementedError() + + +class ErrorClipByValue(BaseErrorClipAttr): + def __init__(self, max, min=None): + max = float(max) + if min is None: + min = -max + else: + min = float(min) + self.max = max + self.min = min + + def append_clip_op(self, block, grad_name): + clip_op_desc = block.desc.append_op() + clip_op_desc.set_type("clip") + clip_op_desc.set_input("X", [grad_name]) + clip_op_desc.set_output("Out", [grad_name]) + clip_op_desc.set_attr("min", self.min) + clip_op_desc.set_attr("max", self.max) + + +def error_clip_callback(block, context): + # the context is a grad_to_var map + grad_to_var = context + op_desc = block.desc.op(block.desc.op_size() - 1) + for grad_n in filter(lambda n: grad_to_var.has_key(n), + op_desc.output_arg_names()): + fwd_var = block.var_recursive(grad_to_var[grad_n]) + error_clip = getattr(fwd_var, "error_clip", None) + if not (error_clip is None or isinstance(error_clip, + BaseErrorClipAttr)): + raise TypeError( + "Variable's error_clip should be an instance of BaseErrorClipAttr or None." 
+ ) + if error_clip is not None: + error_clip.append_clip_op(block, grad_n) + + +class BaseGradientClipAttr(object): + def process_context(self, context, p_g): + raise NotImplementedError() + + def create_operators(self, param, grad): + raise NotImplementedError() + + +class NullGradientClipAttr(BaseGradientClipAttr): + def process_context(self, context, p_g): + pass + + def create_operators(self, param, grad): + return param, grad + + +class GradientClipByValue(BaseGradientClipAttr): + def __init__(self, max, min=None): + max = float(max) + if min is None: + min = -max + else: + min = float(min) + self.max = max + self.min = min + + def process_context(self, context, p_g): + pass + + def create_operators(self, param, grad): + new_grad = layers.clip(x=grad, min=self.min, max=self.max) + return param, new_grad + + +def append_gradient_clip_ops(param_grad): + context = dict() + create_op_callbacks = [] + for p, g in param_grad: + clip_attr = getattr(p, 'clip_attr', NullGradientClipAttr()) + if clip_attr is None: + clip_attr = NullGradientClipAttr() + if not isinstance(clip_attr, BaseGradientClipAttr): + raise TypeError( + "clip attribute should be an instance of BaseGradientClippingAttr" + ) + + clip_attr.process_context(context=context, p_g=param_grad) + create_op_callbacks.append( + functools.partial( + clip_attr.create_operators, param=p, grad=g)) + + return [each_callback() for each_callback in create_op_callbacks] + + +ClipByValue = GradientClipByValue diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py new file mode 100644 index 0000000000000000000000000000000000000000..bfdd00e3ef77f5977926d452a648df5259f58823 --- /dev/null +++ b/python/paddle/v2/fluid/data_feeder.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
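+# A typical use of this module (illustrative sketch; image, label, place +# and exe are assumed to be defined by the caller): +# feeder = DataFeeder(feed_list=[image, label], place=place) +# feed_dict = feeder.feed(minibatch_data) +# exe.run(feed=feed_dict) +# Each sample in minibatch_data supplies one slot per variable in +# feed_list; feed() returns a dict of feed name -> LoDTensor.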
+from __future__ import print_function +import core +import numpy +import six.moves as six + +from framework import Variable, default_main_program + +__all__ = ['DataFeeder'] + + +class DataToLoDTensorConverter(object): + def __init__(self, place, lod_level, shape, dtype): + self.place = place + self.lod_level = lod_level + self.shape = shape + if dtype == core.DataType.FP32: + self.dtype = 'float32' + elif dtype == core.DataType.INT64: + self.dtype = 'int64' + elif dtype == core.DataType.FP64: + self.dtype = 'float64' + elif dtype == core.DataType.INT32: + self.dtype = 'int32' + else: + raise ValueError("dtype must be any of [int32, float32, int64, " + "float64]") + + self.data = [] + self.lod = [] + + for i in six.range(lod_level): + self.lod.append([0]) + + def feed(self, data): + self._feed_impl_(data, self.lod, self.lod_level) + + def _feed_impl_(self, data, lod, lod_level): + if lod_level == 0: + self.data.append(data) + else: + cur_lod_len = len(data) + lod[-1].append(lod[-1][-1] + cur_lod_len) + for each_data in data: + self._feed_impl_(each_data, lod[:-1], lod_level - 1) + + def done(self): + arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) + t = core.LoDTensor() + t.set(arr, self.place) + if self.lod_level > 0: + t.set_lod(self.lod) + return t + + +class DataFeeder(object): + def __init__(self, feed_list, place, program=None): + self.feed_dtypes = [] + self.feed_names = [] + self.feed_shapes = [] + self.feed_lod_level = [] + if program is None: + program = default_main_program() + for each_var in feed_list: + if isinstance(each_var, basestring): + each_var = program.block(0).var(each_var) + if not isinstance(each_var, Variable): + raise TypeError("Feed list should contain a list of variable") + self.feed_dtypes.append(each_var.dtype) + self.feed_names.append(each_var.name) + shape = each_var.shape + batch_size_dim = -1 + for i, s in enumerate(shape): + if s < 0: + batch_size_dim = i + break + if batch_size_dim == -1: + raise ValueError("Variable {0} must has a batch size dimension", + each_var.name) + self.feed_lod_level.append(each_var.lod_level) + self.feed_shapes.append(shape) + + self.place = place + + def feed(self, iterable): + converter = [] + for lod_level, shape, dtype in six.zip( + self.feed_lod_level, self.feed_shapes, self.feed_dtypes): + converter.append( + DataToLoDTensorConverter( + place=self.place, + lod_level=lod_level, + shape=shape, + dtype=dtype)) + + for each_sample in iterable: + for each_converter, each_slot in six.zip(converter, each_sample): + each_converter.feed(each_slot) + ret_dict = {} + for each_name, each_converter in six.zip(self.feed_names, converter): + ret_dict[each_name] = each_converter.done() + return ret_dict diff --git a/python/paddle/v2/fluid/default_scope_funcs.py b/python/paddle/v2/fluid/default_scope_funcs.py index 60c6165b6bd959f7bb3d92afed667f00f73f144f..2218bb140ac57bd3d7fa9cd0b97a9bcea48aa201 100644 --- a/python/paddle/v2/fluid/default_scope_funcs.py +++ b/python/paddle/v2/fluid/default_scope_funcs.py @@ -1,16 +1,29 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. """ Default scope function. -`Paddle` manages Scope as programming language's scope. It just a -thread-local stack of Scope. Top of that stack is current scope, the bottom -of that stack is all scopes' parent. +`Paddle` manages Scope as programming language's scope. It just a +thread-local stack of Scope. Top of that stack is current scope, the bottom +of that stack is all scopes' parent. -Invoking `var/find_var` can `new/find` variable in current scope. -Invoking `enter_local_scope/leave_local_scope` can create or destroy local -scope. +Invoking `var/find_var` can `new/find` variable in current scope. +Invoking `enter_local_scope/leave_local_scope` can create or destroy local +scope. -A `scoped_function` will take a `function` as input. That function will be -invoked in a new local scope. +A `scoped_function` will take a `function` as input. That function will be +invoked in a new local scope. """ import paddle.v2.fluid.core @@ -19,8 +32,12 @@ import threading __tl_scope__ = threading.local() __all__ = [ - 'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'var', - 'find_var', 'scoped_function' + 'get_cur_scope', + 'enter_local_scope', + 'leave_local_scope', + 'var', + 'find_var', + 'scoped_function', ] @@ -71,7 +88,7 @@ def find_var(name): def scoped_function(func): """ invoke `func` in new scope. - + :param func: a callable function that will be run in new scope. :type func: callable """ diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..06a7b6fb02f5e38e0762d112492854f027fe66ad --- /dev/null +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -0,0 +1,509 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function +import framework +from framework import Program, default_main_program, Parameter, Variable +import optimizer +from layer_helper import LayerHelper +from distributed_spliter import * +import math + + +class VarBlock: + def __init__(self, varname, offset, size): + self.varname = varname + # NOTE: real offset is offset * size + self.offset = offset + self.size = size + + def __str__(self): + return "%s:%d:%d" % (self.varname, self.offset, self.size) + + +def split_dense_variable(var_list, + pserver_count, + min_block_size=1024, + max_block_size=1048576): + """ + We may need to split dense tensor to one or several blocks and put + them equally onto parameter server. One block is a sub-tensor + aligned by dim[0] of the tensor. 
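+ + For example (illustrative): a variable "w" with shape [1024, 4] + (4096 elements) split across 2 pservers yields the block strings + "w:0:2048" and "w:1:2048", i.e. name:block_id:size.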
+ + We need to have a minimal block size so that the calculations on + the parameter server side can gain better performance. By default + the minimum block size is 1024. The max block size is used to + prevent blocks that are too large, which may cause send errors. + """ + blocks = [] + for var in var_list: + split_count = pserver_count + var_numel = reduce(lambda x, y: x * y, var.shape) + max_pserver_count = int(math.floor(var_numel / float(min_block_size))) + if max_pserver_count == 0: + max_pserver_count = 1 + if max_pserver_count < pserver_count: + split_count = max_pserver_count + block_size = int(math.ceil(var_numel / float(split_count))) + + if len(var.shape) >= 2: + # align by dim1(width) + dim1 = reduce(lambda x, y: x * y, var.shape[1:]) + remains = block_size % dim1 + if remains != 0: + block_size += dim1 - remains + # update split_count after align + split_count = int(math.ceil(var_numel / float(block_size))) + for block_id in xrange(split_count): + curr_block_size = min(block_size, var_numel - ( + (block_id) * block_size)) + block = VarBlock(var.name, block_id, curr_block_size) + blocks.append(str(block)) + return blocks + + +class DistributeTranspiler: + def transpile(self, + optimize_ops, + params_grads, + program=None, + pservers="127.0.0.1:6174", + trainers=1, + split_method=round_robin): + """ + Transpile the program into distributed data-parallelism programs. + The main_program will be transformed to use a remote parameter server + to do parameter optimization, and the optimization graph will be put + into a parameter server program. + + Use different methods to split trainable variables to different + parameter servers. + + :param optimize_ops: op list of optimization, should be the + return value of Optimizer.minimize + :type optimize_ops: list + :param program: program to optimize, default default_main_program + :param pservers: parameter server endpoints like "m1:6174,m2:6174" + :type pservers: string + :return: return a list of programs + """ + assert (callable(split_method)) + if program is None: + program = default_main_program() + self.program = program + self.trainers = trainers + self.optimize_ops = optimize_ops + # steps to transpile: + # 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). + # 2. modify trainer program to add split_op for each Grad. + # 3. append send_op to trainer. + # 4. append concat_op to trainer to update local weights. + # 5. create new program as parameter server. + # 6. create parameter server program by split_method generated endpoint->VarBlock + + pserver_endpoints = pservers.split(",") + + # step1 + param_list = [pg[0] for pg in params_grads] + grad_list = [pg[1] for pg in params_grads] + # TODO: add split selected rows support + grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) + param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) + # step2 + grad_var_mapping = self._append_split_op(program, grad_blocks) + + # step3 + send_inputs = [] + send_outputs = [] + for b in grad_blocks: # append by order + varname, block_id, _ = b.split(":") + send_inputs.append(grad_var_mapping[varname][int(block_id)]) + + param_var_mapping = self._create_vars_from_blocklist(program, + param_blocks) + for b in param_blocks: + varname, block_id, _ = b.split(":") + send_outputs.append(param_var_mapping[varname][int(block_id)]) + # let send_op know which endpoint to send which var; eplist has the same + # order as send_inputs.
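+ # For example (illustrative): three grad blocks with pservers + # ["ep0:6174", "ep1:6174"] give eplist ["ep0:6174", "ep1:6174", + # "ep0:6174"] under round_robin.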
+ eplist = split_method(send_inputs, pserver_endpoints) + # create mapping of endpoint -> splited var to create pserver side program + self.param_grad_ep_mapping = dict() + for i, ep in enumerate(eplist): + param = send_outputs[i] + grad = send_inputs[i] + if not self.param_grad_ep_mapping.has_key(ep): + self.param_grad_ep_mapping[ep] = {"params": [], "grads": []} + self.param_grad_ep_mapping[ep]["params"].append(param) + self.param_grad_ep_mapping[ep]["grads"].append(grad) + + send_op = program.global_block().append_op( + type="send", + inputs={"X": send_inputs}, + outputs={"Out": send_outputs}, + attrs={"endpoints": pserver_endpoints, + "epmap": eplist}) + # step4 + for varname, splited_var in param_var_mapping.iteritems(): + if len(splited_var) <= 1: + continue + orig_param = program.global_block().vars[varname] + concat = program.global_block().append_op( + type="concat", + inputs={"X": splited_var}, + outputs={"Out": [orig_param]}, + attrs={"axis": 0}) + + def _create_vars_from_blocklist(self, program, block_list): + block_map = dict() + var_mapping = dict() + for block_str in block_list: + varname, offset, size = block_str.split(":") + if not block_map.has_key(varname): + block_map[varname] = [] + block_map[varname].append((long(offset), long(size))) + for varname, splited in block_map.iteritems(): + orig_var = program.global_block().vars[varname] + var_mapping[varname] = [] + if len(splited) == 1: + var_mapping[varname] = [orig_var] + continue + orig_shape = orig_var.shape + orig_dim1_flatten = 1 + if len(orig_shape) >= 2: + orig_dim1_flatten = reduce(lambda x, y: x * y, orig_shape[1:]) + + for i, block in enumerate(splited): + size = block[1] + rows = size / orig_dim1_flatten + splited_shape = [rows] + if len(orig_shape) >= 2: + splited_shape.extend(orig_shape[1:]) + var = program.global_block().create_var( + name="%s.block%d" % (varname, i), + psersistable=False, + dtype=orig_var.dtype, + shape=splited_shape) # flattend splited var + var_mapping[varname].append(var) + return var_mapping + + def _clone_var(self, block, var): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + # HACK: let all param in pserver persistable so child + # program in recv can get them + persistable=True) + + def _append_split_op(self, program, gradblocks): + var_mapping = self._create_vars_from_blocklist(program, gradblocks) + for varname, splited_vars in var_mapping.iteritems(): + # variable that don't need to split have empty splited_vars + if len(splited_vars) <= 1: + continue + orig_var = program.global_block().vars[varname] + sections = [] + for v in splited_vars: + sections.append(v.shape[0]) + program.global_block().append_op( + type="split", + inputs={"X": orig_var}, + outputs={"Out": splited_vars}, + attrs={"sections": sections} # assume split evenly + ) + return var_mapping + + def get_trainer_program(self): + # remove optimize ops and add a send op to main_program + self.program.global_block().delete_ops(self.optimize_ops) + return self.program + + def _create_var_for_trainers(self, block, var, trainers): + var_list = [] + for i in xrange(trainers): + var_each = block.create_var( + name="%s.trainer_%d" % (var.name, i), + psersistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + var_list.append(var_each) + return var_list + + def _get_optimizer_input_shape(self, op_type, varkey, orig_shape, + param_shape): + """ + Returns the shape for optimizer inputs that need to be 
reshaped when + Param and Grad is splited to multiple servers. + """ + # HACK(typhoonzero): Should use functions of corresponding optimizer in + # optimizer.py to get the shape, do not bind this in the transpiler. + if op_type == "adam": + if varkey in ["Moment1", "Moment2"]: + return param_shape + elif op_type == "adagrad": + if varkey == "Moment": + return param_shape + elif op_type == "adamax": + if varkey in ["Moment", "InfNorm"]: + return param_shape + elif op_type == "momentum": + if varkey == "Velocity": + return param_shape + elif op_type == "": + if varkey == "Moment": + return param_shape + elif op_type == "sgd": + pass + return orig_shape + + def _is_op_on_pserver(self, endpoint, all_ops, idx): + """ + Recursively check if the op need to run on current server. + Assume that ops are in the execution order. + """ + param_names = [ + p.name for p in self.param_grad_ep_mapping[endpoint]["params"] + ] + op = all_ops[idx] + if op.inputs.has_key("Param"): + if op.inputs["Param"].name in param_names: + return True + else: + for n in param_names: + if n.startswith(op.inputs["Param"].name+".block") and \ + n != op.inputs["Param"].name: + return True + return False + else: + j = idx - 1 + while j >= 0: + prev_op = all_ops[j] + prev_output_names = [o.name for o in prev_op.outputs.values()] + prev_input_names = [o.name for o in prev_op.inputs.values()] + found1 = False + found2 = False + for _, v in op.inputs.iteritems(): + if v.name in prev_output_names: + found1 = self._is_op_on_pserver(endpoint, all_ops, j) + # later ops may produce output for prev op's next batch use. + for _, v in op.outputs.iteritems(): + if v.name in prev_input_names: + found2 = self._is_op_on_pserver(endpoint, all_ops, j) + if found1 or found2: + return True + j -= 1 + return False + + def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint): + new_inputs = dict() + # update param/grad shape first, then other inputs like + # moment can use the updated shape + for key, var in opt_op.inputs.iteritems(): + if key == "Grad": + grad_block = None + for g in self.param_grad_ep_mapping[endpoint]["grads"]: + if g.name.startswith(var.name): + grad_block = g + break + if not grad_block: + # do not append this op if current endpoint + # is not dealing with this grad block + return + merged_var = program.global_block().create_var( + name=grad_block.name, + persistable=grad_block.persistable, + dtype=grad_block.dtype, + shape=grad_block.shape) + # append merging ops if trainers > 1 + if self.trainers > 1: + vars2merge = self._create_var_for_trainers( + program.global_block(), grad_block, self.trainers) + program.global_block().append_op( + type="sum", + inputs={"X": vars2merge}, + outputs={"Out": merged_var}) + program.global_block().append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainers)}) + new_inputs[key] = merged_var + elif key == "Param": + # param is already created on global program + param_block = None + for p in self.param_grad_ep_mapping[endpoint]["params"]: + if p.name.startswith(var.name): + param_block = p + break + if not param_block: + return + tmpvar = program.global_block().create_var( + name=param_block.name, + persistable=True, + dtype=param_block.dtype, + shape=param_block.shape) + + new_inputs[key] = tmpvar + + for key, var in opt_op.inputs.iteritems(): + if key in ["Param", "Grad"]: + continue + # update accumulator variable shape + param_shape = new_inputs["Param"].shape + new_shape = 
self._get_optimizer_input_shape(opt_op.type, key, + var.shape, param_shape) + tmpvar = program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=new_shape) + new_inputs[key] = tmpvar + # create var in pserver program global block. + # TODO(typhoonzero): put blocks in one program to avoid create two + # variables. + pserver_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=new_shape) + + # change outputs ParamOut variable + opt_op.outputs["ParamOut"] = new_inputs["Param"] + program.global_block().append_op( + type=opt_op.type, + inputs=new_inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + + def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op): + for _, var in opt_op.inputs.iteritems(): + program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + pserver_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + + def get_pserver_program(self, endpoint, optimize_ops): + """ + get pserver side program by endpoint + + NOTE: assume blocks of the same variable is not distributed + on the same pserver, only change param/grad varnames for + trainers to fetch. For each pserver endpoint, server side + program must be a sub-set of the original optimization program. + """ + # step5 + pserver_program = Program() + for v in self.param_grad_ep_mapping[endpoint]["params"]: + self._clone_var(pserver_program.global_block(), v) + # step6 + optimize_sub_program = Program() + for idx, opt_op in enumerate(optimize_ops): + is_op_on_pserver = self._is_op_on_pserver(endpoint, optimize_ops, + idx) + if not is_op_on_pserver: + continue + if opt_op.inputs.has_key("Grad"): + self._append_pserver_ops(optimize_sub_program, pserver_program, + opt_op, endpoint) + else: + self._append_pserver_non_opt_ops(optimize_sub_program, + pserver_program, opt_op) + pserver_program.global_block().append_op( + type="recv", + inputs={"RX": self.param_grad_ep_mapping[endpoint]["grads"] + }, # grads to recv + outputs={}, + attrs={ + "OptimizeProgram": optimize_sub_program.desc, + "endpoint": endpoint, + "ParamList": [ + p.name + for p in self.param_grad_ep_mapping[endpoint]["params"] + ], + "GradList": [ + p.name + for p in self.param_grad_ep_mapping[endpoint]["grads"] + ], + "Trainers": self.trainers + }) + pserver_program.sync_with_cpp() + return pserver_program + + def get_startup_program(self, endpoint, pserver_program): + """ + Get startup program for current parameter server. + Modify operator input variables if there are variables that + was splited to several blocks. + """ + s_prog = Program() + orig_s_prog = framework.default_startup_program() + params = self.param_grad_ep_mapping[endpoint]["params"] + + def _get_splited_name_and_shape(varname): + for idx, splited_param in enumerate(params): + pname = splited_param.name + if pname.startswith(varname) and varname != pname: + return pname, splited_param.shape + return "", [] + + # 1. 
create vars in pserver program to startup program + pserver_vars = pserver_program.global_block().vars + created_var_map = dict() + for _, var in pserver_vars.iteritems(): + tmpvar = s_prog.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + created_var_map[var.name] = tmpvar + + # 2. rename op outputs + for op in orig_s_prog.global_block().ops: + new_outputs = dict() + # do not append startup op if var is not on this pserver + op_on_pserver = False + for key, var in op.outputs.iteritems(): + newname, _ = _get_splited_name_and_shape(var.name) + if newname: + op_on_pserver = True + new_outputs[key] = created_var_map[newname] + elif var.name in pserver_vars: + op_on_pserver = True + new_outputs[key] = pserver_vars[var.name] + + if op_on_pserver: + if op.type in [ + "gaussian_random", "fill_constant", "uniform_random" + ]: + op.attrs["shape"] = new_outputs["Out"].shape + s_prog.global_block().append_op( + type=op.type, + inputs=op.inputs, + outputs=new_outputs, + attrs=op.attrs) + return s_prog diff --git a/python/paddle/v2/fluid/distribute_transpiler_simple.py b/python/paddle/v2/fluid/distribute_transpiler_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..bd88f02bde0c6a58138e20db2b07cbd06cd40ba3 --- /dev/null +++ b/python/paddle/v2/fluid/distribute_transpiler_simple.py @@ -0,0 +1,255 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
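+# Unlike the block-splitting transpiler above, placement here is per-parameter: +# hash_name_to_server (for example) assigns a parameter to +# pserver_endpoints[hash(param.name) % len(pserver_endpoints)], so a parameter +# and its gradient always land on the same endpoint.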
+import framework +from framework import Program, default_main_program, Parameter, Variable +import optimizer +from layer_helper import LayerHelper + + +def hash_name_to_server(params_grads, pserver_endpoints): + """ + :param params_grads: a list of (parameter, gradient) pairs + :return: a map of pserver endpoint -> + params -> [param list] + grads -> [grad list] + """ + + def _hash_param(param_name, total): + return hash(param_name) % total + + param_grad_map = dict() + for param, grad in params_grads: + if param.trainable is True and grad is not None: + server_id = _hash_param(param.name, len(pserver_endpoints)) + server_for_param = pserver_endpoints[server_id] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + return param_grad_map + + +def round_robin(params_grads, pserver_endpoints): + assert (len(params_grads) > len(pserver_endpoints)) + + param_grad_map = dict() + pserver_idx = 0 + for param, grad in params_grads: + if param.trainable is True: + server_for_param = pserver_endpoints[pserver_idx] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + pserver_idx += 1 + if pserver_idx >= len(pserver_endpoints): + pserver_idx = 0 + return param_grad_map + + +class SimpleDistributeTranspiler: + def transpile(self, + optimize_ops, + params_grads, + program=None, + pservers="127.0.0.1:6174", + trainers=1, + split_method=round_robin): + """ + Transpile the program into distributed data-parallelism programs. + + The main_program will be transformed to use a remote parameter server + to do parameter optimization, and the optimization graph will be put + into a parameter server program. + + Use different methods to split trainable variables to different + parameter servers. + + Example to run: + + exe = fluid.Executor(place) + t = fluid.DistributeTranspiler() + t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1) + + pserver_endpoint = os.getenv("PSERVER") + if pserver_endpoint: + pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) + else: + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + ...
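+ + In the sketch above, the PSERVER environment variable decides the role: + a process with it set runs the optimize-only pserver program, while the + others run the trainer program.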
+ + :param optimize_ops: op list of optimization, should be the + return value of Optimizer.minimize + :type optimize_ops: list + :param program: program to optimize, default default_main_program + :param pservers: parameter server endpoints like "m1:6174,m2:6174" + :type pservers: string + + :return: return a list of programs + """ + if program is None: + program = default_main_program() + self.program = program + self.trainers = trainers + self.optimize_ops = optimize_ops + self._optimize_distributed( + optimize_ops, + program, + params_grads, + pservers=pservers, + trainers=trainers, + split_method=split_method) + + def _clone_param(self, block, v): + assert isinstance(v, Parameter) + new_p = Parameter( + block=block, + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=v.stop_gradient, + trainable=v.trainable, + optimize_attr=v.optimize_attr, + regularizer=v.regularizer, + name=v.name) + block.vars[new_p.name] = new_p + + def _clone_var(self, block, var): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=var.persistable) + + def _optimize_distributed(self, optimize_ops, program, params_and_grads, + **kwargs): + if kwargs.has_key("split_method"): + split_method = kwargs["split_method"] + else: + split_method = round_robin + + assert (callable(split_method)) + pserver_endpoints = kwargs["pservers"].split(",") + self.param_grad_map = split_method(params_and_grads, pserver_endpoints) + + send_op_ordered_inputs = [] + send_op_ordered_outputs = [] + epmap = [] + for ep, v in self.param_grad_map.iteritems(): + send_op_ordered_inputs.extend(v["grads"]) + send_op_ordered_outputs.extend(v["params"]) + for i in v["grads"]: + epmap.append(ep) + send_op = program.global_block().append_op( + type="send", + inputs={"X": send_op_ordered_inputs + }, # inputs is a list of tensors to be send + outputs={"Out": send_op_ordered_outputs}, + attrs={"endpoints": pserver_endpoints, + "epmap": epmap}) + + def get_trainer_program(self): + # remove optimize ops and add a send op to main_program + self.program.global_block().delete_ops(self.optimize_ops) + return self.program + + def _create_var_for_trainers(self, block, var, trainers): + var_list = [] + for i in xrange(trainers): + var_each = block.create_var( + name="%s.trainer_%d" % (var.name, i), + psersistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + var_list.append(var_each) + return var_list + + def get_pserver_program(self, endpoint, optimize_ops): + pserver_program = Program() + for v in self.param_grad_map[endpoint]["params"]: + self._clone_param(pserver_program.global_block(), v) + + optimize_sub_program = Program() + grad_var_names = [ + var.name for var in self.param_grad_map[endpoint]["grads"] + ] + for opt_op in optimize_ops: + for _, var in opt_op.inputs.iteritems(): + # NOTE: append operators to merge gradients from multiple + # trainers. If trainers == 1, this is not needed. 
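+ # For example (illustrative): with 3 trainers, a gradient "w@GRAD" + # arrives as w@GRAD.trainer_0 .. w@GRAD.trainer_2; the copies are + # summed into a single variable and scaled by 1/3.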
+ if self.trainers > 1 and var.name in grad_var_names: + vars2merge = self._create_var_for_trainers( + optimize_sub_program.global_block(), var, self.trainers) + merged_var = optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + optimize_sub_program.global_block().append_op( + type="sum", + inputs={"X": vars2merge}, + outputs={"Out": merged_var}) + optimize_sub_program.global_block().append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainers)}) + else: + optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + if opt_op.inputs.has_key("Grad"): + if opt_op.inputs["Grad"].name in grad_var_names: + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + else: + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + pserver_program.global_block().append_op( + type="recv", + inputs={"RX": + self.param_grad_map[endpoint]["grads"]}, # grads to recv + outputs={}, + attrs={ + "OptimizeProgram": optimize_sub_program.desc, + "endpoint": endpoint, + "ParamList": + [p.name for p in self.param_grad_map[endpoint]["params"]], + "GradList": + [p.name for p in self.param_grad_map[endpoint]["grads"]], + "Trainers": self.trainers + }) + pserver_program.sync_with_cpp() + return pserver_program diff --git a/python/paddle/v2/fluid/distributed_spliter.py b/python/paddle/v2/fluid/distributed_spliter.py new file mode 100644 index 0000000000000000000000000000000000000000..e647f760e9d3d400e28f54215b684079b2279ffc --- /dev/null +++ b/python/paddle/v2/fluid/distributed_spliter.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +def hash_name(varlist, pserver_endpoints): + """ + hash variable names to several endpoints. + + :param varlist: a list of Variables + :return: a map of pserver endpoint -> varname + """ + + def _hash_block(block_str, total): + return hash(block_str) % total + + eplist = [] + for var in varlist: + server_id = _hash_block(var.name(), len(pserver_endpoints)) + server_for_param = pserver_endpoints[server_id] + eplist.append(server_for_param) + return eplist + + +def round_robin(varlist, pserver_endpoints): + """ + distribute variables to several endpoints. 
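+ + For example (illustrative): three variables with endpoints + ["ep0", "ep1"] produce the assignment ["ep0", "ep1", "ep0"].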
+ """ + assert (len(varlist) > len(pserver_endpoints)) + + eplist = [] + pserver_idx = 0 + for var in varlist: + server_for_param = pserver_endpoints[pserver_idx] + eplist.append(server_for_param) + + pserver_idx += 1 + if pserver_idx >= len(pserver_endpoints): + pserver_idx = 0 + return eplist diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index 3a8f1831cf2c44c81aee62c6ee172942db188217..adf174a07daeea521fa3a1c97273ec68b3a9a67f 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -1,14 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np -from paddle.v2.fluid.framework import Program, g_main_program, unique_name, Variable -import paddle.v2.fluid.core as core +import layers +from framework import Program, unique_name, Variable, program_guard +from layer_helper import LayerHelper -def _clone_var_in_block_(block, var): +__all__ = [ + 'Accuracy', + 'ChunkEvaluator', +] + + +def _clone_var_(block, var): assert isinstance(var, Variable) return block.create_var( name=var.name, shape=var.shape, - dtype=var.data_type, + dtype=var.dtype, type=var.type, lod_level=var.lod_level, persistable=True) @@ -16,172 +36,171 @@ def _clone_var_in_block_(block, var): class Evaluator(object): """ - Evalutor Base class. - - create metric states - add mini-batch evaluator caculate operator - add increment operator to accumulate the metric states + Base Class for all evaluators + + Args: + name(str): The name of evaluator. such as, "accuracy". Used for generate + temporary variable name. + main_program(Program, optional): The evaluator should be added to this + main_program. Default default_main_program() + startup_program(Program, optional):The parameter should be added to this + startup_program. Default default_startup_program() + + Attributes: + states(list): The list of state variables. states will be reset to zero + when `reset` is invoked. + metrics(list): The list of metrics variables. 
They will be calculated + every mini-batch """ def __init__(self, name, **kwargs): - """ - init the global states - """ - self._states = {} - if kwargs.has_key("main_program"): - self._main_program = kwargs.get("main_program") - else: - self._main_program = g_main_program - - def _update_ops(self, *args, **kwargs): - """ - append update ops to the global states - """ - raise NotImplementedError() + self.states = [] + self.metrics = [] + self.helper = LayerHelper(name, **kwargs) def reset(self, executor, reset_program=None): """ - Clear metric states at the begin of each pass/user specified batch + reset metric states at the beginning of each pass/user-specified batch """ - if reset_program == None: + if reset_program is None: reset_program = Program() - else: - reset_program = program - block = reset_program.global_block() - for k, var in self._states.iteritems(): - g_var = _clone_var_in_block_(block, var) - zeros = block.create_var(dtype="float32", persistable=True) - block.append_op( - type="fill_constant", - outputs={"Out": [zeros]}, - attrs={ - "shape": g_var.shape, - "value": .0, - "data_type": 5, - }) - block.append_op( - type="scale", inputs={"X": zeros}, outputs={"Out": g_var}) - executor.run(reset_program, fetch_list=self._states.values()) + + with program_guard(main_program=reset_program): + for var in self.states: + assert isinstance(var, Variable) + g_var = _clone_var_(reset_program.current_block(), var) + layers.fill_constant( + shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var) + + executor.run(reset_program) def eval(self, executor, eval_program=None): """ - Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. + Evaluate the statistics merged by multiple mini-batches. """ raise NotImplementedError() + def create_state(self, suffix, dtype, shape): + """ + Create state variable. + + NOTE: It is not a public API. + + Args: + suffix(str): the state suffix. + dtype(str|core.DataType): the state data type + shape(tuple|list): the shape of state + + Returns: State variable + + """ + state = self.helper.create_variable( + name="_".join([unique_name(self.helper.name), suffix]), + persistable=True, + dtype=dtype, + shape=shape) + self.states.append(state) + return state + class Accuracy(Evaluator): """ - Accuracy need two state variable Total, Correct + Average Accuracy for multiple mini-batches.
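+ + A minimal usage sketch (illustrative; predict, label, exe and feeder + are assumed to be defined by the caller): + + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + accuracy.reset(exe) + for data in train_reader(): + exe.run(feed=feeder.feed(data), fetch_list=accuracy.metrics) + pass_acc = accuracy.eval(exe)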
""" - def __init__(self, *args, **kwargs): + def __init__(self, input, label, k=1, **kwargs): super(Accuracy, self).__init__("accuracy", **kwargs) - block = self._main_program.global_block() - g_total = block.create_var( - name=unique_name("Total"), - persistable=True, - dtype="int64", - shape=[1]) - g_correct = block.create_var( - name=unique_name("Correct"), - persistable=True, - dtype="int64", - shape=[1]) - self._states["Total"] = g_total - self._states["Correct"] = g_correct - - def _update_ops(self, input, label, k=1, **kwargs): - block = self._main_program.global_block() - topk_out = block.create_var(dtype=input.data_type) - topk_indices = block.create_var(dtype="int64") - block.append_op( - type="top_k", - inputs={"X": [input]}, - outputs={"Out": [topk_out], - "Indices": [topk_indices]}, - attrs={"k": k}) - acc_out = block.create_var(dtype=kwargs.get("out_dtype", "float32")) - correct = block.create_var(dtype="int64", persistable=True) - total = block.create_var(dtype="int64", persistable=True) - block.append_op( - type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - outputs={ - "Accuracy": [acc_out], - "Correct": [correct], - "Total": [total], - }) - - block.append_op( - type="cast", - inputs={"X": [self._states["Total"]]}, - outputs={"Out": [self._states["Total"]]}, - attrs={ - "in_data_type": 5, # float32 - "out_data_type": 2, #int32 - }) - block.append_op( - type="cast", - inputs={"X": [self._states["Correct"]]}, - outputs={"Out": [self._states["Correct"]]}, - attrs={ - "in_data_type": 5, - "out_data_type": 2, - }) - - block.append_op( - type="elementwise_add", - inputs={"X": [self._states["Total"]], - "Y": [total]}, - outputs={"Out": [self._states["Total"]]}) - block.append_op( - type="elementwise_add", - inputs={"X": [self._states["Correct"]], - "Y": [correct]}, - outputs={"Out": [self._states["Correct"]]}) - - return acc_out + main_program = self.helper.main_program + if main_program.current_block().idx != 0: + raise ValueError("You can only invoke Evaluator in root block") + + self.total = self.create_state(dtype='int64', shape=[1], suffix='total') + self.correct = self.create_state( + dtype='int64', shape=[1], suffix='correct') + total = self.helper.create_tmp_variable(dtype='int') + correct = self.helper.create_tmp_variable(dtype='int') + acc = layers.accuracy( + input=input, label=label, k=k, total=total, correct=correct) + total = layers.cast(x=total, dtype='int64') + correct = layers.cast(x=correct, dtype='int64') + layers.sums(input=[self.total, total], out=self.total) + layers.sums(input=[self.correct, correct], out=self.correct) + + self.metrics.append(acc) + + def eval(self, executor, eval_program=None): + if eval_program is None: + eval_program = Program() + block = eval_program.current_block() + with program_guard(main_program=eval_program): + total = _clone_var_(block, self.total) + correct = _clone_var_(block, self.correct) + total = layers.cast(total, dtype='float32') + correct = layers.cast(correct, dtype='float32') + out = layers.elementwise_div(x=correct, y=total) + return np.array(executor.run(eval_program, fetch_list=[out])[0]) + + +class ChunkEvaluator(Evaluator): + """ + Accumulate counter numbers output by chunk_eval from mini-batches and + compute the precision recall and F1-score using the accumulated counter + numbers. 
+ """ + + def __init__( + self, + input, + label, + chunk_scheme, + num_chunk_types, + excluded_chunk_types=None, ): + super(ChunkEvaluator, self).__init__("chunk_eval") + main_program = self.helper.main_program + if main_program.current_block().idx != 0: + raise ValueError("You can only invoke Evaluator in root block") + + self.num_infer_chunks = self.create_state( + dtype='int64', shape=[1], suffix='num_infer_chunks') + self.num_label_chunks = self.create_state( + dtype='int64', shape=[1], suffix='num_label_chunks') + self.num_correct_chunks = self.create_state( + dtype='int64', shape=[1], suffix='num_correct_chunks') + precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( + input=input, + label=label, + chunk_scheme=chunk_scheme, + num_chunk_types=num_chunk_types, + excluded_chunk_types=excluded_chunk_types, ) + layers.sums( + input=[self.num_infer_chunks, num_infer_chunks], + out=self.num_infer_chunks) + layers.sums( + input=[self.num_label_chunks, num_label_chunks], + out=self.num_label_chunks) + layers.sums( + input=[self.num_correct_chunks, num_correct_chunks], + out=self.num_correct_chunks) + + self.metrics.extend([precision, recall, f1_score]) def eval(self, executor, eval_program=None): - if eval_program != None: - eval_program = eval_program - else: + if eval_program is None: eval_program = Program() - block = eval_program.global_block() - eval_out = block.create_var(dtype=self._states["Total"].data_type) - e_total = _clone_var_in_block_(block, self._states["Total"]) - e_correct = _clone_var_in_block_(block, self._states["Correct"]) - block.append_op( - type="cast", - inputs={"X": [e_total]}, - outputs={"Out": [e_total]}, - attrs={ - "in_data_type": 2, #int32 - "out_data_type": 5, #float32 - }) - block.append_op( - type="cast", - inputs={"X": [e_correct]}, - outputs={"Out": [e_correct]}, - attrs={ - "in_data_type": 2, - "out_data_type": 5, - }) - block.append_op( - type="elementwise_div", - inputs={"X": e_correct, - "Y": e_total}, - outputs={"Out": eval_out}) - out = executor.run(eval_program, fetch_list=[eval_out]) - return np.array(out[0]) - - -def accuracy(*args, **kwargs): - cls = Accuracy(*args, **kwargs) - out = cls._update_ops(*args, **kwargs) - return cls, out + block = eval_program.current_block() + num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run( + eval_program, + fetch_list=[_clone_var_(block, state) for state in self.states]) + num_infer_chunks = num_infer_chunks[0] + num_label_chunks = num_label_chunks[0] + num_correct_chunks = num_correct_chunks[0] + precision = float( + num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0 + recall = float( + num_correct_chunks) / num_label_chunks if num_label_chunks else 0 + f1_score = float(2 * precision * recall) / ( + precision + recall) if num_correct_chunks else 0 + return np.array( + [precision], dtype='float32'), np.array( + [recall], dtype='float32'), np.array( + [f1_score], dtype='float32') diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index ed1c2c06daa7ede97e138049a1f7044d071c31e8..a99c5157b285d7edbf06398c00df3e7ec514cd91 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,9 +1,72 @@ -import paddle.v2.fluid.core as core -from paddle.v2.fluid.framework import Block, Program, g_main_program +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import numpy as np +import contextlib +from framework import Program, default_main_program +from . import core + +__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope'] g_scope = core.Scope() +def global_scope(): + return g_scope + + +def switch_scope(scope): + global g_scope + ex = g_scope + g_scope = scope + return ex + + +@contextlib.contextmanager +def scope_guard(scope): + ex = switch_scope(scope) + yield + switch_scope(ex) + + +def as_numpy(tensor): + if isinstance(tensor, list): + return [as_numpy(t) for t in tensor] + assert isinstance(tensor, core.LoDTensor) + lod = tensor.lod() + tensor_data = np.array(tensor) + if len(lod) == 0: + ans = tensor_data + else: + raise RuntimeError("LoD Calculate lacks unit tests and buggy") + # elif len(lod) == 1: + # ans = [] + # idx = 0 + # while idx < len(lod) - 1: + # ans.append(tensor_data[lod[idx]:lod[idx + 1]]) + # idx += 1 + # else: + # for l in reversed(lod): + # ans = [] + # idx = 0 + # while idx < len(l) - 1: + # ans.append(tensor_data[l[idx]:l[idx + 1]]) + # idx += 1 + # tensor_data = ans + # ans = tensor_data + return ans + + class Executor(object): def __init__(self, places): if not isinstance(places, list) and not isinstance(places, tuple): @@ -15,7 +78,49 @@ class Executor(object): p.set_place(each) act_places.append(p) - self.executor = core.Executor(act_places) + # TODO(dzhwinter) : only use the first place + self.executor = core.Executor(act_places[0]) + self.places = places + + def aslodtensor(self, data): + def accumulate(data): + if not isinstance(data, list): + return 1 + return sum([accumulate(sub) for sub in data]) + + def parselod(data): + seq_lens = [accumulate(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + return lod + + assert len(self.places) != 0 + if not isinstance(data, list): + # pure tensor case + tensor = core.LoDTensor() + tensor.set(data, self.places[0]) + return tensor + else: + raise RuntimeError("Current implementation lacks unittests") + # lodtensor case + lod = [] + if not isinstance(data[0], list): + lod.append(parselod(data)) + flattened_data = np.concatenate(data, axis=0).astype("int64") + else: + while isinstance(data[0], list): + lod.append(parselod(seq)) + flattened_data = [item for seq in data for item in seq] + data = flattened_data + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + tensor = core.LoDTensor() + tensor.set(flattened_data, self.places[0]) + tensor.set_lod(lod) + return tensor def run(self, program=None, @@ -23,20 +128,21 @@ class Executor(object): fetch_list=None, feed_var_name='feed', fetch_var_name='fetch', - scope=None): + scope=None, + return_numpy=True): if feed is None: feed = {} if fetch_list is None: fetch_list = [] if program is None: - program = g_main_program + program = default_main_program() if not isinstance(program, Program): raise TypeError() if scope is None: - scope = 
g_scope + scope = global_scope() program = program.clone() global_block = program.global_block() @@ -52,7 +158,10 @@ class Executor(object): inputs={'X': [feed_var]}, outputs={'Out': [out]}, attrs={'col': i}) - core.set_feed_variable(scope, feed[name], feed_var.name, i) + cur_feed = feed[name] + if not isinstance(cur_feed, core.LoDTensor): + cur_feed = self.aslodtensor(cur_feed) + core.set_feed_variable(scope, cur_feed, feed_var.name, i) fetch_var = global_block.create_var( name=fetch_var_name, @@ -65,8 +174,12 @@ class Executor(object): outputs={'Out': [fetch_var]}, attrs={'col': i}) - self.executor.run(program.desc, scope, 0, True) - return [ + self.executor.run(program.desc, scope, 0, True, True) + outs = [ core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) ] + + if return_numpy: + outs = as_numpy(outs) + return outs diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 7f7c310ad87f64e5d047ecfc2876d516914c75c8..8042febfed7ed7db2f6d1507142b17079aa00fd8 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -1,21 +1,72 @@ -import paddle.v2.fluid.core as core -import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import collections +import contextlib + import numpy as np -import copy + +import proto.framework_pb2 as framework_pb2 +from . import core __all__ = [ - 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', - 'default_main_program' + 'Block', + 'Variable', + 'Program', + 'Operator', + 'default_startup_program', + 'default_main_program', + 'program_guard', + 'switch_startup_program', + 'switch_main_program', ] +EMPTY_VAR_NAME = core.kEmptyVarName() +TEMP_VAR_NAME = core.kTempVarName() +GRAD_VAR_SUFFIX = core.kGradVarSuffix() +ZERO_VAR_SUFFIX = core.kZeroVarSuffix() + + +def grad_var_name(var_name): + """ + return gradient name for a certain var name + """ + return var_name + GRAD_VAR_SUFFIX + def unique_name(prefix): + """ + Generate unique names with prefix + + Args: + prefix(str): The prefix of return string + + Returns(str): A unique string with the prefix + + """ uid = core.unique_integer(prefix) # unique during whole process. return "_".join([prefix, str(uid)]) def convert_np_dtype_to_dtype_(np_dtype): + """ + Convert the data type in numpy to the data type in Paddle + Args: + np_dtype(np.dtype): the data type in numpy + + Returns(core.DataType): the data type in Paddle + + """ dtype = np.dtype(np_dtype) if dtype == np.float32: return core.DataType.FP32 @@ -36,17 +87,33 @@ def convert_np_dtype_to_dtype_(np_dtype): def dtype_is_floating(dtype): + """ + Check the data type is floating or not. + Args: + dtype(np.dtype|core.DataType): data type. 
+ Could be numpy format or Paddle format + + Returns(bool): True if data type is a float value + + """ if not isinstance(dtype, core.DataType): dtype = convert_np_dtype_to_dtype_(dtype) - if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or - dtype == core.DataType.FP64): - return True - else: - return False + return dtype in [core.DataType.FP16, core.DataType.FP32, core.DataType.FP64] def _debug_string_(proto, throw_on_error=True): + """ + Get the debug string of a protobuf message. The message might not be + initialized. + Args: + proto(google.protobuf.message.Message): The protobuf message + throw_on_error(bool): True if an error should be raised when the + protobuf message is not initialized. + + Returns(str): The debug string of the protobuf message + + """ error_fields = list() if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError("{0} are not initialized\nThe message is {1}".format( @@ -55,6 +122,38 @@ class Variable(object): + """ + Python variable. Every input and output of an operator is a variable. Every + variable belongs to a block. The variable has a name, and two variables in + different blocks could have the same name. + + There are many kinds of variables. Please reference the framework.proto for + details. + + Notes: The constructor of Variable should not be invoked directly. Please + use `Block.create_var` to create a variable. + + >>> cur_program = Program() + >>> cur_block = cur_program.current_block() + >>> new_variable = cur_block.create_var( + >>> name="X", shape=[-1, 23, 48], dtype='float32') + + Args: + block(Block): The associated block. It will be passed by + `Block.create_var` automatically. + type(core.VarDesc.VarType): Variable type. Please reference the + framework.proto for details. + shape(tuple|list|None): The shape of the variable. -1 means the batch size. + Some kinds of variable do not contain shape, just set it to None. + dtype(np.dtype|core.DataType|str): The data type of the variable. + lod_level(int): The level of the lod tensor. 0 means there is no + time-series data. + persistable(bool): True if the variable should be saved as a checkpoint. + Defaults to False. + stop_gradient(bool): True if the variable stops computing gradients + in the backward pass. Defaults to False. + """ + def __init__(self, block, type=core.VarDesc.VarType.LOD_TENSOR, @@ -63,9 +162,11 @@ dtype=None, lod_level=None, persistable=None, + error_clip=None, stop_gradient=False, **kwargs): self.block = block + self.error_clip = error_clip if name is None: name = Variable._unique_var_name_() @@ -99,9 +200,9 @@ if not isinstance(dtype, core.DataType): dtype = convert_np_dtype_to_dtype_(dtype) if is_new_var: - self.desc.set_data_type(dtype) + self.desc.set_dtype(dtype) else: - old_dtype = self.data_type + old_dtype = self.dtype if dtype != old_dtype: raise ValueError("Variable {0} has been created before. " "The previous data type is {1}; the new " @@ -138,12 +239,25 @@ return self.to_string(True) def to_string(self, throw_on_error): + """ + Get debug string. + + Args: + throw_on_error(bool): True if an exception should be raised when self + is not initialized. + + Returns(str): The debug string.
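+ + A short illustrative call, reusing `new_variable` from the class + docstring above: + + >>> print(new_variable.to_string(throw_on_error=True))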
+ + """ protostr = self.desc.serialize_to_string() proto = framework_pb2.VarDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) __repr__ = __str__ + def set_desc(self, input): + self.desc = input + @property def persistable(self): return self.desc.persistable() @@ -162,8 +276,8 @@ class Variable(object): return tuple(self.desc.shape()) @property - def data_type(self): - return self.desc.data_type() + def dtype(self): + return self.desc.dtype() @property def lod_level(self): @@ -179,11 +293,16 @@ class Variable(object): uid = core.unique_integer(prefix) # unique during whole process. return "_".join([prefix, str(uid)]) + def set_error_clip(self, error_clip): + self.error_clip = error_clip + def get_all_op_protos(): """ Get all registered op proto from PaddlePaddle C++ end. - :return: A list of registered OpProto. + + Returns(list): list of OpProto + """ protostrs = core.get_all_op_protos() ret_values = [] @@ -194,6 +313,10 @@ def get_all_op_protos(): class OpProtoHolder(object): + """ + A global variable to hold all OpProtos from C++ as a map + """ + @classmethod def instance(cls): if not hasattr(cls, '_instance'): @@ -210,12 +333,26 @@ class OpProtoHolder(object): self.op_proto_map[proto.type] = proto def get_op_proto(self, type): + """ + Get OpProto by a type string. + Args: + type(str): The type that operator registered in C++ side. + + Returns(framework_pb2.OpProto): The OpProto + + """ if type not in self.op_proto_map: raise ValueError("Operator \"%s\" has not been registered." % type) return self.op_proto_map[type] class Operator(object): + """ + Python Operator class. The operator represents the build in instructs in a + Block. Users can use the build in instructs to describe their neural + network. + """ + def __init__(self, block, desc, @@ -223,8 +360,36 @@ class Operator(object): inputs=None, outputs=None, attrs=None): + """ + Constructor. + + Notes: The constructor of operator should not be invoked directly. Use + Block.append_op or Block.prepend_op instead. + + >>> cur_program = Program() + >>> cur_block = cur_program.current_block() + >>> # var1 += var2 + var3 + >>> cur_block.append_op(type="sum", + >>> inputs={"X": [var1, var2, var3]}, + >>> outputs={"Out": [var1]}) + + Args: + block(Block): The block has the current operator + desc(core.OpDesc): The protobuf description + type(str): The type of operator. + inputs(dict): The input dictionary. Key is the input parameter name. + Value is a list of variables. + outputs(dict): The output dictionary. Has same format with inputs + attrs(dict): The attributes dictionary. Key is attribute name. Value + is the attribute value. 
The attribute type must be + the same as the type registered in C++ + """ self.block = block self.desc = desc + # keep these so that the operator can be cloned + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs if len(self.desc.type()) != 0: return if type is None: @@ -235,7 +400,7 @@ def find_name(var_list, name): for var_name in var_list: - if var_name == name: + if var_list[var_name] is not None and var_name == name: return True return False @@ -255,7 +420,10 @@ % (in_proto.name, len(in_args))) in_arg_names = [] for arg in in_args: - in_arg_names.append(arg.name) + if isinstance(arg, basestring): + in_arg_names.append(arg) + else: + in_arg_names.append(arg.name) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -296,19 +464,33 @@ continue if isinstance(attrs[attr_name], Block): self.desc.set_block_attr(attr_name, attrs[attr_name].desc) + elif isinstance(attrs[attr_name], core.BlockDesc) or \ + isinstance(attrs[attr_name], core.ProgramDesc): + self.desc.set_serialized_attr( + attr_name, attrs[attr_name].serialize_to_string()) else: self.desc.set_attr(attr_name, attrs[attr_name]) self.desc.check_attrs() no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', - 'rnn_memory_helper_grad', 'conditional_block', 'while' + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', + 'recv', 'parallel_do' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) def to_string(self, throw_on_error): + """ + Get the debug string. + Args: + throw_on_error(bool): True if an exception should be raised when self + is not initialized + + Returns(str): The debug string. + + """ protostr = self.desc.serialize_to_string() proto = framework_pb2.OpDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) @@ -323,21 +505,55 @@ return self.desc.type() def input(self, name): + """ + Get input arguments by the input parameter name + Args: + name(str): The input parameter name + + Returns(list): return the list of argument names associated with the + specific parameter name. + + """ return self.desc.input(name) @property def input_names(self): + """ + Get all input parameter names + Returns(list): return a list of input parameter names + + """ return self.desc.input_names() def output(self, name): + """ + Get output arguments by the output parameter name + Args: + name(str): The output parameter name + + Returns(list): return the list of argument names associated with the + specific parameter name. + + """ return self.desc.output(name) @property def output_names(self): + """ + Get all output parameter names + Returns(list): return a list of output parameter names + + """ return self.desc.output_names() @property def idx(self): + """ + Return the array index of the current operator. + Returns(int): The array index in block.ops array + Raises: + ValueError: when the operator is not found. + """ for i, op in enumerate(self.block.ops): if op == self: return i @@ -345,19 +561,57 @@ "Can't find op itself in its block. It could be a bug of Paddle.") def has_attr(self, name): + """ + Whether the operator has the attribute with the given name. + Args: + name(str): the attribute name + + Returns(bool): True if the operator has this attribute.
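+ + For example, guarded attribute access could look like the following + (assuming `op` is an existing Operator): + + >>> if op.has_attr("axis"): + >>> axis = op.attr("axis")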
+ + """ return self.desc.has_attr(name) def attr_type(self, name): + """ + Get the type of attribute by attribute name + Args: + name(str): the attribute name + + Returns(core.AttrType): the attribute type + + """ return self.desc.attr_type(name) @property def attr_names(self): + """ + Get all attribute names + Returns(list): The list of attribute name + + """ return self.desc.attr_names() def attr(self, name): + """ + Get attribute by name + Args: + name(str): the attribute name + + Returns(bool|int|str|float|list): The attribute value. The return value + can be any valid attribute type. + + """ return self.desc.attr(name) def block_attr(self, name): + """ + Get the block attribute by name + Args: + name(str): the attribute name + + Returns(int): the block index + + """ return self.desc.block_attr(name) @@ -367,6 +621,7 @@ class Block(object): self.vars = dict() # var_name --> var self.ops = collections.deque() # operator list self.program = program + self.removed_vars = dict() def __str__(self): return self.to_string(True) @@ -394,8 +649,23 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v + def var_recursive(self, name): + if self.has_var(name): + return self.var(name) + else: + if self.idx == 0: + raise ValueError("var %s is not in block(%d) nor its parents." % + name, self.idx) + else: + parent_block = self.program.block(self.parent_idx) + return parent_block.var_recursive(name) + def all_parameters(self): - return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)} + return list(self.iter_parameters()) + + def iter_parameters(self): + return (item[1] for item in self.vars.iteritems() + if isinstance(item[1], Parameter)) def create_var(self, *args, **kwargs): var = Variable(self, *args, **kwargs) @@ -419,6 +689,16 @@ class Block(object): self.ops.append(op) return op + def delete_ops(self, ops): + # remove from cpp + # FIXME(typhoonzero): remove only the first occuracy. 
+ try: + start = list(self.ops).index(ops[0]) + end = list(self.ops).index(ops[-1]) + except Exception, e: + raise e + self.desc.remove_op(start, end + 1) + def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) @@ -469,12 +749,46 @@ class Block(object): for index in range(len(self.ops)): assert self.ops[index].desc == ops_in_cpp[index] + def copy_param_info_from(self, other): + """ + Copy the information of parameters from other block + Args: + other(Block): other block + + Returns: + None + """ + if not isinstance(other, Block): + raise TypeError("copy_param_info_from should be invoked with Block") + for p in other.iter_parameters(): + assert isinstance(p, Parameter) + v = self.vars.get(p.name, None) + if v is None: + raise ValueError("copy_param_info_from should be invoked with " + "same topology") + assert isinstance(v, Variable) + new_p = Parameter( + block=self, + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + clip_attr=p.clip_attr, + error_clip=p.error_clip, + name=v.name) + self.vars[new_p.name] = new_p + class Program(object): def __init__(self): self.desc = core.ProgramDesc() self.blocks = [Block(self, 0)] self.current_block_idx = 0 + self._seed = 0 def __str__(self): return self.to_string(True) @@ -484,11 +798,15 @@ class Program(object): proto = framework_pb2.ProgramDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) + def get_desc(self): + return self.desc + def clone(self): p = Program() p.desc = core.ProgramDesc(self.desc) p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] p.sync_with_cpp() + p.copy_param_info_from(self) return p def prune(self, targets): @@ -511,6 +829,13 @@ class Program(object): res.sync_with_cpp() return res + def inference_optimize(self): + res = Program() + res.desc = core.inference_optimize(self.desc) + res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] + res.sync_with_cpp() + return res + @staticmethod def parse_from_string(binary_str): p = Program() @@ -519,6 +844,16 @@ class Program(object): p.sync_with_cpp() return p + @property + def random_seed(self): + return self._seed + + @random_seed.setter + def random_seed(self, seed): + if not isinstance(seed, int): + raise ValueError("Seed must be a integer.") + self._seed = seed + def __repr__(self): return str(self) @@ -549,9 +884,11 @@ class Program(object): self.sync_with_cpp() return param_to_grad_info - def create_block(self): + def create_block(self, parent_idx=None): new_block_idx = len(self.blocks) - self.desc.append_block(self.current_block().desc) + parent = self.current_block() if parent_idx is None else self.block( + parent_idx) + self.desc.append_block(parent.desc) self.current_block_idx = new_block_idx self.blocks.append(Block(self, self.current_block_idx)) return self.current_block() @@ -565,6 +902,24 @@ class Program(object): for block in self.blocks: block.sync_with_cpp() + def copy_param_info_from(self, other): + """ + Copy the information of parameters from other program. 
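+ Program.clone() calls this so that the cloned program keeps + per-parameter settings (trainable, optimize_attr, regularizer, + clip_attr) rather than reducing parameters to plain variables, e.g. + with `train_program` an existing Program: + + >>> inference_program = train_program.clone()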
+ Args: + other(Program): Other program + + Returns: + None + """ + if not isinstance(other, Program): + raise TypeError("copy_param_info_from should be invoked with " + "Program") + + if len(self.blocks) != len(other.blocks): + raise ValueError("copy_param_info_from should be invoked with two " + "program, with represent the same topology") + self.global_block().copy_param_info_from(other.global_block()) + def list_vars(self): for each_block in self.blocks: for each_var in each_block.vars.itervalues(): @@ -591,15 +946,92 @@ class Parameter(Variable): self.regularizer = kwargs.get('regularizer', None) + self.clip_attr = kwargs.get('clip_attr', None) + # program is a global instance. -g_main_program = Program() -g_startup_program = Program() +_main_program_ = Program() +_startup_program_ = Program() def default_startup_program(): - return g_startup_program + """ + Get default startup program. In startup program, Paddle will initialize + parameters, initialize nccl handle, etc. + + Returns: + Program: startup program + """ + return _startup_program_ def default_main_program(): - return g_main_program + """ + Get default main program. The main program is used for training or testing. + + Returns: + Program: main program + """ + return _main_program_ + + +def switch_main_program(program): + """ + Switch the main program to a new program. + + Args: + program(Program): The new main program + + Returns: + Program: The previous main program + """ + global _main_program_ + prev_program = _main_program_ + _main_program_ = program + return prev_program + + +def switch_startup_program(program): + """ + Switch the startup program to a new program + Args: + program(Program): The new startup program + + Returns: + Program: The previous startup program + """ + global _startup_program_ + prev_program = _startup_program_ + _startup_program_ = program + return prev_program + + +@contextlib.contextmanager +def program_guard(main_program, startup_program=None): + """ + Switch program with `with` statement + + Examples: + >>> with program_guard(Program()): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Args: + main_program(Program): New main program inside `with` statement + startup_program(Program): New startup program inside `with` statement. + None means do not change startup program. + + Returns: + None + """ + if not isinstance(main_program, Program): + raise TypeError("main_program should be Program") + main_program = switch_main_program(main_program) + if startup_program is not None: + if not isinstance(startup_program, Program): + raise TypeError("startup_program should be Program") + startup_program = switch_startup_program(startup_program) + yield + switch_main_program(main_program) + if startup_program is not None: + switch_startup_program(startup_program) diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index ded144ecd5db83ce50ca0dc6243fdc52ac0b7a2f..2e8cfa3177ba00b48a1297b1deec7350588745f2 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -1,9 +1,24 @@ -import paddle.v2.fluid.framework as framework +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import framework import numpy as np __all__ = [ - 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', - 'XavierInitializer' + 'Constant', + 'Uniform', + 'Normal', + 'Xavier', ] @@ -93,7 +108,7 @@ class ConstantInitializer(Initializer): outputs={"Out": var}, attrs={ "shape": var.shape, - "data_type": int(var.data_type), + "dtype": int(var.dtype), "value": self._value }) var.op = op @@ -135,12 +150,14 @@ class UniformInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed op = block.prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ "shape": var.shape, - "data_type": int(var.data_type), + "dtype": int(var.dtype), "min": self._low, "max": self._high, "seed": self._seed @@ -183,12 +200,14 @@ class NormalInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed op = block.prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ "shape": var.shape, - "data_type": int(var.data_type), + "dtype": int(var.dtype), "mean": self._mean, "std": self._std_dev, "seed": self._seed @@ -258,6 +277,9 @@ class XavierInitializer(Initializer): fan_in = f_in if self._fan_in is None else self._fan_in fan_out = f_out if self._fan_out is None else self._fan_out + if self._seed == 0: + self._seed = block.program.random_seed + if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) op = block.prepend_op( @@ -265,7 +287,7 @@ class XavierInitializer(Initializer): outputs={"Out": var}, attrs={ "shape": var.shape, - "data_type": int(var.data_type), + "dtype": int(var.dtype), "min": -limit, "max": limit, "seed": self._seed @@ -278,10 +300,112 @@ class XavierInitializer(Initializer): outputs={"Out": var}, attrs={ "shape": var.shape, - "data_type": int(var.data_type), + "dtype": int(var.dtype), "mean": 0.0, "std": std, "seed": self._seed }) var.op = op return op + + +class MSRAInitializer(Initializer): + """Implements the MSRA initializer a.k.a. Kaiming Initializer + + This class implements the weight initialization from the paper + Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren + and Jian Sun. This is a robust initialization method that particularly + considers the rectifier nonlinearities. In case of Uniform distribution, + the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal + distribution, the mean is 0 and the standard deviation + is sqrt(2/ fan_in). + + References: + [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance + on ImageNet Classification + (https://arxiv.org/abs/1502.01852) + """ + + def __init__(self, uniform=True, fan_in=None, seed=0): + """Constructor for MSRAInitializer + + Args: + uniform: whether to use uniform or normal distribution + fan_in: fan_in for MSRAInitializer. 
If None, it is + inferred from the variable. + seed: random seed + + Note: It is recommended to set fan_in to None for most cases. + """ + assert uniform is not None + assert seed is not None + super(MSRAInitializer, self).__init__() + self._uniform = uniform + self._fan_in = fan_in + self._seed = seed + + def __call__(self, var, block): + """Add MSRA initialization ops for a variable + + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + f_in, f_out = self._compute_fans(var) + + # If fan_in is passed, use it + fan_in = f_in if self._fan_in is None else self._fan_in + + if self._seed == 0: + self._seed = block.program.random_seed + + if self._uniform: + limit = np.sqrt(6.0 / float(fan_in)) + op = block.prepend_op( + type="uniform_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": int(var.dtype), + "min": -limit, + "max": limit, + "seed": self._seed + }) + + else: + std = np.sqrt(2.0 / float(fan_in)) + op = block.prepend_op( + type="gaussian_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": int(var.dtype), + "mean": 0.0, + "std": std, + "seed": self._seed + }) + var.op = op + return op + + +# We shorten the class names, since users will use the initializer with the +# package name. Sample code: +# +# import paddle.fluid as fluid +# +# hidden = fluid.layers.fc(..., +# param_attr=ParamAttr(fluid.initializer.Xavier())) +# +# There is no need to add `Initializer` as a class suffix +Constant = ConstantInitializer +Uniform = UniformInitializer +Normal = NormalInitializer +Xavier = XavierInitializer +MSRA = MSRAInitializer diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index 2d070814eef0b099ba71bef223596e30388ac48a..499df05e592855f63f41ec8ceb939edf0e4d435c 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -1,16 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import os import cPickle as pickle -from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \ - Variable +from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable __all__ = [ - 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', - 'load_persistables', "save_inference_model", "load_inference_model" + 'save_vars', + 'save_params', + 'save_persistables', + 'load_vars', + 'load_params', + 'load_persistables', + 'save_inference_model', + 'load_inference_model', + 'get_inference_program', ] def is_parameter(var): + """Check whether the variable is a Parameter. + + This function checks whether the input variable is a Parameter. + + Args: + var: The input variable. + + Returns: + bool: True if the variable is a Parameter.
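+ + For example, to collect all parameters of a program (assuming + `program` is an existing Program): + + >>> param_list = filter(is_parameter, program.list_vars())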
+ """ return isinstance(var, Parameter) @@ -23,7 +52,7 @@ def _clone_var_in_block_(block, var): return block.create_var( name=var.name, shape=var.shape, - dtype=var.data_type, + dtype=var.dtype, type=var.type, lod_level=var.lod_level, persistable=True) @@ -36,7 +65,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): :param executor: executor that save variable :param dirname: directory path :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default g_program. + program which fit `predicate`. Default default_main_program. :param predicate: The Predicate describes a callable that returns a variable as a bool. If it returns true, the variables will be saved. :param vars: variables need to be saved. If specify vars, program & predicate @@ -45,7 +74,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): """ if vars is None: if main_program is None: - main_program = g_main_program + main_program = default_main_program() if not isinstance(main_program, Program): raise TypeError("program should be as Program type or None") @@ -97,7 +126,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): :param executor: executor that save variable :param dirname: directory path :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default g_program. + program which fit `predicate`. Default default_main_program(). :param predicate: The Predicate describes a callable that returns a variable as a bool. If it returns true, the variables will be loaded. :param vars: variables need to be loaded. If specify vars, program & @@ -106,7 +135,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): """ if vars is None: if main_program is None: - main_program = g_main_program + main_program = default_main_program() if not isinstance(main_program, Program): raise TypeError("program's type should be Program") @@ -151,6 +180,17 @@ def load_persistables(executor, dirname, main_program=None): predicate=is_persistable) +def get_inference_program(target_vars, main_program=None): + if main_program is None: + main_program = default_main_program() + if not isinstance(target_vars, list): + target_vars = [target_vars] + + pruned_program = main_program.prune(targets=target_vars) + inference_program = pruned_program.inference_optimize() + return inference_program + + def save_inference_model(dirname, feeded_var_names, target_vars, @@ -165,29 +205,47 @@ def save_inference_model(dirname, :param target_vars: Variables from which we can get inference results. :param executor: executor that save inference model :param main_program: original program, which will be pruned to build the inference model. - Default g_main_program. + Default default_main_program(). 
:return: None """ - if main_program is None: - main_program = g_main_program - if not isinstance(target_vars, list): + if isinstance(feeded_var_names, basestring): + feeded_var_names = [feeded_var_names] + else: + if not (bool(feeded_var_names) and all( + isinstance(name, basestring) for name in feeded_var_names)): + raise ValueError("'feed_var_names' should be a list of str.") + + if isinstance(target_vars, Variable): target_vars = [target_vars] + else: + if not (bool(target_vars) and all( + isinstance(var, Variable) for var in target_vars)): + raise ValueError("'target_vars' should be a list of Variable.") + + if main_program is None: + main_program = default_main_program() if not os.path.isdir(dirname): os.makedirs(dirname) - pruned_program = main_program.prune(target_vars) + pruned_program = main_program.prune(targets=target_vars) + inference_program = pruned_program.inference_optimize() fetch_var_names = [v.name for v in target_vars] model_file_name = dirname + "/__model__" with open(model_file_name, "w") as f: pickle.dump({ - "program_desc_str": pruned_program.desc.serialize_to_string(), + "program_desc_str": inference_program.desc.serialize_to_string(), "feed_var_names": feeded_var_names, "fetch_var_names": fetch_var_names }, f, -1) + # Save only programDesc of inference_program in binary format + # in another file: __model__.dat + with open(model_file_name + ".dat", "wb") as fp: + fp.write(inference_program.desc.serialize_to_string()) + save_params(executor, dirname, main_program) @@ -259,10 +317,10 @@ def get_parameter_value_by_name(name, executor, program=None): :param executor: executor for retrieving the value :param name: the name of the parameter :param program: the program where the variable is found - Default g_main_program. + Default default_main_program(). :return: the LoDTensor for the variable """ if program is None: - program = g_main_program + program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 5697eaa460cf21bf73add1b460947e4f3d4edfc3..191d2349b5e692f1f8ad9c068daf5592031433ad 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -1,10 +1,23 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
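+# +# With this change `param_attr`/`bias_attr` are normalized through +# ParamAttr.to_attr() instead of being handled as raw dicts, so a layer is +# configured roughly as in the sketch below (the name `image`, the size and +# the package-level `fluid.ParamAttr` export are illustrative assumptions): +# +# import paddle.v2.fluid as fluid +# hidden = fluid.layers.fc(input=image, +# size=128, +# param_attr=fluid.ParamAttr( +# initializer=fluid.initializer.Xavier()))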
import copy import itertools -from paddle.v2.fluid.framework import Variable, g_main_program, \ - g_startup_program, unique_name, Program, dtype_is_floating -from paddle.v2.fluid.initializer import ConstantInitializer, \ - UniformInitializer, XavierInitializer +from framework import Variable, Parameter, default_main_program, default_startup_program, \ + unique_name, dtype_is_floating +from paddle.v2.fluid.initializer import Constant, Xavier +from param_attr import ParamAttr class LayerHelper(object): @@ -21,19 +34,11 @@ class LayerHelper(object): @property def main_program(self): - prog = self.kwargs.get('main_program', None) - if prog is None: - return g_main_program - else: - return prog + return default_main_program() @property def startup_program(self): - prog = self.kwargs.get('startup_program', None) - if prog is None: - return g_startup_program - else: - return prog + return default_startup_program() def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) @@ -61,31 +66,15 @@ class LayerHelper(object): @property def param_attr(self): - default = {'name': None} - actual = self.kwargs.get('param_attr', None) - if actual is None: - actual = default - for default_field in default.keys(): - if default_field not in actual: - actual[default_field] = default[default_field] - return actual + return ParamAttr.to_attr(self.kwargs.get('param_attr', None)) @property def bias_attr(self): - default = {'name': None} - bias_attr = self.kwargs.get('bias_attr', None) - if bias_attr is None: - bias_attr = default - - if isinstance(bias_attr, dict): - for default_field in default.keys(): - if default_field not in bias_attr: - bias_attr[default_field] = default[default_field] - return bias_attr + return ParamAttr.to_attr(self.kwargs.get('bias_attr', None)) def multiple_param_attr(self, length): param_attr = self.param_attr - if isinstance(param_attr, dict): + if isinstance(param_attr, ParamAttr): param_attr = [param_attr] if len(param_attr) != 1 and len(param_attr) != length: @@ -108,31 +97,48 @@ class LayerHelper(object): dtype = None for each in inputs: if dtype is None: - dtype = each.data_type - elif dtype != each.data_type: + dtype = each.dtype + elif dtype != each.dtype: raise ValueError("Data Type mismatch") return dtype - def create_parameter(self, attr, shape, dtype, suffix='w', - initializer=None): + def create_parameter(self, + attr, + shape, + dtype, + is_bias=False, + default_initializer=None): # Deepcopy the attr so that parameters can be shared in program - attr_copy = copy.deepcopy(attr) - if initializer is not None: - attr_copy['initializer'] = initializer + assert isinstance(attr, ParamAttr) + suffix = 'b' if is_bias else 'w' + + if default_initializer is None: + if is_bias: + attr.set_default_bias_initializer() + else: + attr.set_default_param_initializer() else: - attr_copy['initializer'] = self._get_default_initializer(dtype) - if attr_copy['name'] is None: - attr_copy['name'] = unique_name(".".join([self.name, suffix])) + attr.set_default_initializer(default_initializer) + if attr.name is None: + attr.name = unique_name(".".join([self.name, suffix])) + self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr_copy) + dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True)) return self.main_program.global_block().create_parameter( - name=attr_copy['name'], dtype=dtype, shape=shape) + dtype=dtype, shape=shape, **attr.to_kwargs()) + + def get_parameter(self, name): + param = 
self.main_program.global_block().var(name) + if not isinstance(param, Parameter): + raise ValueError("no Parameter name %s found" % name) + return param - def create_tmp_variable(self, dtype): + def create_tmp_variable(self, dtype, stop_gradient=False): return self.main_program.current_block().create_var( name=unique_name(".".join([self.name, 'tmp'])), dtype=dtype, - persistable=False) + persistable=False, + stop_gradient=stop_gradient) def create_variable(self, *args, **kwargs): return self.main_program.current_block().create_var(*args, **kwargs) @@ -146,16 +152,12 @@ class LayerHelper(object): self.startup_program.global_block().create_var( name=var.name, type=var.type, - dtype=var.data_type, + dtype=var.dtype, shape=var.shape, persistable=True, initializer=initializer) - def append_bias_op(self, - input_var, - bias_initializer, - dim_start=1, - dim_end=None): + def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var @@ -175,12 +177,8 @@ class LayerHelper(object): return input_var b = self.create_parameter( - attr=bias_attr, - shape=size, - dtype=input_var.data_type, - suffix='b', - initializer=bias_initializer) - tmp = self.create_tmp_variable(dtype=input_var.data_type) + attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True) + tmp = self.create_tmp_variable(dtype=input_var.dtype) self.append_op( type='elementwise_add', inputs={'X': [input_var], @@ -195,18 +193,24 @@ class LayerHelper(object): return input_var if isinstance(act, basestring): act = {'type': act} - tmp = self.create_tmp_variable(dtype=input_var.data_type) + tmp = self.create_tmp_variable(dtype=input_var.dtype) act_type = act.pop('type') self.append_op( type=act_type, inputs={"X": [input_var]}, - outputs={"Y": [tmp]}, + outputs={"Out": [tmp]}, attrs=act) return tmp def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: - return XavierInitializer() + return Xavier() else: # For integer and boolean types, initialize with all zeros - return ConstantInitializer() + return Constant() + + def is_instance(self, param_name, cls): + param = self.kwargs.get(param_name, None) + if not isinstance(param, cls): + raise TypeError("The input {0} parameter of method {1} must be {2}", + param_name, self.layer_type, cls.__name__) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py deleted file mode 100644 index 26a10ae766c9c37b68951613c494029a6f162084..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/layers.py +++ /dev/null @@ -1,1739 +0,0 @@ -import paddle.v2.fluid.core as core -import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 -from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \ - Operator -from paddle.v2.fluid.initializer import ConstantInitializer, \ - NormalInitializer, XavierInitializer -from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -import re -import cStringIO - -__all__ = [ - 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', - 'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim', - 'batch_norm', 'accuracy', 'split_lod_tensor' -] - - -def fc(input, - size, - param_attr=None, - param_initializer=None, - bias_attr=None, - bias_initializer=None, - name=None, - act=None, - num_flatten_dims=1, - main_program=None, - startup_program=None): - """ - Fully Connected Layer. 
- - Args: - input: The input tensor to the function - size: The size of the layer - param_attr: The parameters/weights to the FC Layer - param_initializer: Initializer used for the weight/parameter. - If None, XavierInitializer() is used - bias_attr: The bias parameter for the FC layer - bias_initializer: Initializer used for the bias. - If None, then ConstantInitializer() is used - name: Name/alias of the function - act: Activation to be applied to the output of FC layer - num_flatten_dims: Number of columns in input - main_program: Name of the main program that calls this - startup_program: Name of the startup program - - This function can take in multiple inputs and performs the Fully Connected - function (linear transformation) on top of each of them. - So for input x, the output will be : Wx + b. Where W is the parameter, - b the bias and x is the input. - - The function also applies an activation (non-linearity) on top of the - output, if activation is passed in the input. - - All the input variables of this function are passed in as local variables - to the LayerHelper constructor. - - """ - - def _get_default_param_initializer(): - return XavierInitializer() - - def _get_default_bias_initializer(): - return ConstantInitializer() - - helper = LayerHelper('fc', **locals()) - - dtype = helper.input_dtype() - - if param_initializer is None: - param_initializer = _get_default_param_initializer() - - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - - mul_results = [] - for input_var, param_attr in helper.iter_inputs_and_params(): - input_shape = input_var.shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) - ] + [size] - w = helper.create_parameter( - attr=param_attr, - initializer=param_initializer, - shape=param_shape, - dtype=dtype) - tmp = helper.create_tmp_variable(dtype) - helper.append_op( - type="mul", - inputs={ - "X": input_var, - "Y": w, - }, - outputs={"Out": tmp}, - attrs={'x_num_col_dims': num_flatten_dims, - 'y_num_col_dims': 1}) - mul_results.append(tmp) - - # sum - if len(mul_results) == 1: - pre_bias = mul_results[0] - else: - pre_bias = helper.create_tmp_variable(dtype) - helper.append_op( - type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) - # add bias - pre_activation = helper.append_bias_op(pre_bias, bias_initializer) - # add activation - return helper.append_activation(pre_activation) - - -def embedding(input, - size, - data_type='float32', - is_sparse=False, - param_attr=None, - main_program=None, - startup_program=None): - """ - Embedding Layer. - - Args: - input: The input to the function - size: The size of the layer - data_type: The type of data : float32, float_16, int etc - is_sparse: A flag that decleares whether the input is sparse - param_attr: Parameters for this layer - main_program: Name of the main program that calls this - startup_program: Name of the startup program - - This function can take in the input (which is a vector of IDs) and - performs a lookup in the lookup_table using these IDs, to result into - the embedding of each ID in the input. - - All the input variables of this function are passed in as local variables - to the LayerHelper constructor. 
- - """ - helper = LayerHelper('embedding', **locals()) - w = helper.create_parameter( - attr=helper.param_attr, shape=size, dtype=data_type) - tmp = helper.create_tmp_variable(data_type) - helper.append_op( - type='lookup_table', - inputs={'Ids': input, - 'W': w}, - outputs={'Out': tmp}, - attrs={'is_sparse': is_sparse}) - return tmp - - -# TODO(qijun): expose H0 and C0 -def dynamic_lstm(input, - size, - data_type='float32', - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - main_program=None, - startup_program=None): - helper = LayerHelper('lstm', **locals()) - size = size / 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=data_type) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=data_type, suffix='b') - - hidden = helper.create_tmp_variable(data_type) - cell = helper.create_tmp_variable(data_type) - batch_gate = helper.create_tmp_variable(data_type) - batch_cell_pre_act = helper.create_tmp_variable(data_type) - - helper.append_op( - type='lstm', - inputs={'Input': input, - 'Weight': weight, - 'Bias': bias}, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell - - -def data(name, - shape, - data_type='float32', - type=core.VarDesc.VarType.LOD_TENSOR, - append_batch_size=True, - main_program=None, - startup_program=None, - stop_gradient=True): - """ - Data Layer. - - Args: - name: The name/alias of the function - shape: Tuple declaring the shape. - data_type: The type of data : float32, float_16, int etc - type: The output type. By default it is LOD_TENSOR. - append_batch_size: Whether or not to append the data as a batch. - main_program: Name of the main program that calls this - startup_program: Name of the startup program - stop_gradient: A boolean that mentions whether gradient should flow. - - This function takes in input and based on whether data has - to be returned back as a minibatch, it creates the global variable using - the helper functions. The global variables can be accessed by all the - following operations and layers in the graph. - - All the input variables of this function are passed in as local variables - to the LayerHelper constructor. - - """ - helper = LayerHelper('data', **locals()) - shape = list(shape) - for i in xrange(len(shape)): - if shape[i] is None: - shape[i] = -1 - append_batch_size = False - elif shape[i] < 0: - append_batch_size = False - - if append_batch_size: - shape = [-1] + shape # append batch size as -1 - - return helper.create_global_variable( - name=name, - shape=shape, - dtype=data_type, - type=type, - stop_gradient=stop_gradient) - - -def create_tensor(dtype, name=None, main_program=None, startup_program=None): - helper = LayerHelper("create_tensor", **locals()) - return helper.create_variable(name=helper.name, dtype=dtype) - - -def _convert_(name): - """ - Formatting. - - Args: - name: The name/alias - - This function takes in a name and converts it to a standard format of - group1_group2. 
Where as per the regular expression, group1 can have - alphabets and numbers and group2 has capital alphabets. - - """ - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - - -def _generate_doc_string_(op_proto): - """ - Generate docstring by OpProto - - Args: - op_proto (framework_pb2.OpProto): a protobuf message typed OpProto - - Returns: - str: the document string - """ - - def _type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - - if not isinstance(op_proto, framework_pb2.OpProto): - raise TypeError("OpProto should be `framework_pb2.OpProto`") - - buf = cStringIO.StringIO() - buf.write(op_proto.comment) - buf.write('\nArgs:\n') - for each_input in op_proto.inputs: - line_begin = ' {0}: '.format(_convert_(each_input.name)) - buf.write(line_begin) - buf.write(each_input.comment) - buf.write('\n') - buf.write(' ' * len(line_begin)) - buf.write('Duplicable: ') - buf.write(str(each_input.duplicable)) - buf.write(' Optional: ') - buf.write(str(each_input.dispensable)) - buf.write('\n') - - for each_attr in op_proto.attrs: - buf.write(' ') - buf.write(each_attr.name) - buf.write(' (') - buf.write(_type_to_str_(each_attr.type)) - buf.write('): ') - buf.write(each_attr.comment) - buf.write('\n') - - if len(op_proto.outputs) != 0: - buf.write('\nReturns:\n') - buf.write(' ') - for each_opt in op_proto.outputs: - if not each_opt.intermediate: - break - buf.write(each_opt.comment) - - return buf.getvalue() - - -def _create_op_func_(op_type): - """ - Create an Operator for a Function. - - Args: - op_type: The name of the operator to be created - - This function takes in the operator type (sigmoid, mean , average etc) and - creates the operator functionality. - - """ - op_proto = OpProtoHolder.instance().get_op_proto(op_type) - not_intermediate_outputs = \ - filter(lambda output: not output.intermediate, op_proto.outputs) - intermediate_outputs = \ - filter(lambda output: output.intermediate, op_proto.outputs) - - if len(not_intermediate_outputs) != 1: - raise ValueError("Only one non intermediate output operator can be", - "automatically generated") - - if not_intermediate_outputs[0].duplicable: - raise ValueError( - "Only non duplicable op can be automatically generated") - - for output in intermediate_outputs: - if output.duplicable: - raise ValueError("The op can be automatically generated only when ", - "all intermediate ops are not duplicable") - - o_name = not_intermediate_outputs[0].name - intermediate_output_names = [output.name for output in intermediate_outputs] - - def infer_and_check_data_type(op_proto, **kwargs): - """ - This function performs the sanity check for data_type and - instance type. 
- """ - dtype = None - for ipt in op_proto.inputs: - name = _convert_(ipt.name) - val = kwargs.pop(name, []) - if not isinstance(val, list) and not isinstance(val, tuple): - val = [val] - for each in val: - if not isinstance(each, Variable): - raise ValueError("input of {0} must be variable".format( - op_type)) - - if dtype is None: - dtype = each.data_type - elif dtype != each.data_type: - raise ValueError( - "operator {0} must input same dtype".format(op_type)) - - return dtype - - def func(**kwargs): - helper = LayerHelper(op_type, **kwargs) - - dtype = infer_and_check_data_type(op_proto, **kwargs) - - inputs = dict() - for ipt in op_proto.inputs: - name = _convert_(ipt.name) - val = kwargs.pop(name, []) - if not isinstance(val, list) and not isinstance(val, tuple): - val = [val] - inputs[ipt.name] = val - - outputs = dict() - out = helper.create_tmp_variable(dtype=dtype) - outputs[o_name] = [out] - for name in intermediate_output_names: - outputs[name] = [helper.create_tmp_variable(dtype=dtype)] - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) - return helper.append_activation(out) - - func.__name__ = op_type - globals()[op_type] = func - func.__doc__ = _generate_doc_string_(op_proto) - global __all__ - __all__.append(op_type) - - -_create_op_func_('mean') -_create_op_func_('mul') -_create_op_func_('elementwise_add') -_create_op_func_('dropout') -_create_op_func_('reshape') -_create_op_func_('sigmoid') -_create_op_func_('scale') -_create_op_func_('reshape') -_create_op_func_('transpose') - - -def cast(x, data_type, main_program=None): - """ - This function takes in the input with input_data_type - and casts it to the output_data_type as the output. - """ - helper = LayerHelper('cast', **locals()) - out = helper.create_tmp_variable(dtype=data_type) - helper.append_op( - type='cast', - inputs={'X': [x]}, - outputs={'Out': [out]}, - attrs={'in_data_type': x.data_type, - 'out_data_type': out.data_type}) - return out - - -def concat(input, axis, main_program=None, startup_program=None): - """ - This function concats the input along the axis mentioned - and returns that as the output. - """ - helper = LayerHelper('concat', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) - helper.append_op( - type='concat', - inputs={'X': input}, - outputs={'Out': [out]}, - attrs={'axis': axis}) - return out - - -def sums(input, main_program=None, startup_program=None): - """ - This function takes in the input and performs the sum operation on it - and returns that as the output. 
- """ - helper = LayerHelper('sum', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) - helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out}) - return out - - -def assign(input, output, main_program=None, startup_program=None): - helper = LayerHelper('assign', **locals()) - helper.append_op( - type='scale', - inputs={'X': [input]}, - outputs={'Out': [output]}, - attrs={'scale': 1.0}) - return output - - -def split_lod_tensor(input, - mask, - level=0, - main_program=None, - startup_program=None): - helper = LayerHelper('split_lod_tensor', **locals()) - out_true = helper.create_tmp_variable(dtype=input.data_type) - out_false = helper.create_tmp_variable(dtype=input.data_type) - helper.append_op( - type='split_lod_tensor', - inputs={ - 'X': input, - 'Mask': mask, - }, - outputs={'OutTrue': out_true, - 'OutFalse': out_false}, - attrs={'level': level}) - return out_true, out_false - - -def merge_lod_tensor(in_true, - in_false, - x, - mask, - level=0, - main_program=None, - startup_program=None): - helper = LayerHelper('merge_lod_tensor', **locals()) - out = helper.create_tmp_variable(dtype=in_true.data_type) - helper.append_op( - type='merge_lod_tensor', - inputs={'X': x, - 'Mask': mask, - 'InTrue': in_true, - 'InFalse': in_false}, - outputs={'Out': out}, - attrs={'level': level}) - return out - - -def cos_sim(X, Y, **kwargs): - """ - This function performs the cosine similarity between two tensors - X and Y and returns that as the output. - """ - helper = LayerHelper('cos_sim', **kwargs) - out = helper.create_tmp_variable(dtype=X.data_type) - xnorm = helper.create_tmp_variable(dtype=X.data_type) - ynorm = helper.create_tmp_variable(dtype=X.data_type) - helper.append_op( - type='cos_sim', - inputs={'X': [X], - 'Y': [Y]}, - outputs={'Out': [out], - 'XNorm': [xnorm], - 'YNorm': [ynorm]}) - return out - - -def cross_entropy(input, label, **kwargs): - """ - This function computes cross_entropy using the input and label. - """ - helper = LayerHelper('cross_entropy', **kwargs) - out = helper.create_tmp_variable(dtype=input.data_type) - helper.append_op( - type='cross_entropy', - inputs={'X': [input], - 'Label': [label]}, - outputs={'Y': [out]}, - attrs=kwargs) - return out - - -def square_error_cost(input, label, **kwargs): - """ - This functions returns the squared error cost using the input and label. - The output is appending the op to do the above. - """ - helper = LayerHelper('square_error_cost', **kwargs) - minus_out = helper.create_tmp_variable(dtype=input.data_type) - helper.append_op( - type='elementwise_sub', - inputs={'X': [input], - 'Y': [label]}, - outputs={'Out': [minus_out]}) - - square_out = helper.create_tmp_variable(dtype=input.data_type) - helper.append_op( - type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]}) - return square_out - - -def accuracy(input, label, k=1, **kwargs): - """ - This function computes the accuracy using the input and label. - The output is the top_k inputs and their indices. 
- """ - helper = LayerHelper("accuracy", **kwargs) - topk_out = helper.create_tmp_variable(dtype=input.data_type) - topk_indices = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="top_k", - inputs={"X": [input]}, - outputs={"Out": [topk_out], - "Indices": [topk_indices]}, - attrs={"k": k}) - acc_out_dtype = kwargs.get("out_dtype", "float32") - acc_out = helper.create_tmp_variable(dtype="float32") - correct = helper.create_tmp_variable(dtype="int64") - total = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - outputs={ - "Accuracy": [acc_out], - "Correct": [correct], - "Total": [total], - }) - return acc_out - - -def sequence_conv(input, - num_filters, - filter_size=3, - filter_stride=1, - act=None, - padding=None, - bias_attr=None, - bias_initializer=None, - param_attr=None, - param_initializer=None, - main_program=None, - startup_program=None): - """ - This function creates the op for sequence_conv, using the inputs and - other convolutional configurations for the filters and stride as given - in the input parameters to the function. - """ - - def _get_default_bias_initializer(): - return ConstantInitializer() - - def _get_default_param_initializer(): - return XavierInitializer() - - # FIXME(dzh) : want to unify the argument of python layer - # function. So we ignore some unecessary attributes. - # such as, padding_trainable, context_start. - - helper = LayerHelper('sequence_conv', **locals()) - dtype = helper.input_dtype() - - if param_initializer is None: - param_initializer = _get_default_param_initializer() - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - - filter_shape = [filter_size * input.shape[1], num_filters] - filter = helper.create_parameter( - attr=helper.param_attr, - shape=filter_shape, - dtype=dtype, - initializer=param_initializer) - pre_bias = helper.create_tmp_variable(dtype) - - helper.append_op( - type='sequence_conv', - inputs={ - 'X': [input], - 'Filter': [filter], - }, - outputs={"Out": pre_bias}, - attrs={ - 'contextStride': filter_stride, - 'contextStart': -int(filter_size / 2), - 'contextLength': filter_size - }) - pre_act = helper.append_bias_op(pre_bias, bias_initializer) - return helper.append_activation(pre_act) - - -def conv2d(input, - num_filters, - name=None, - filter_size=[1, 1], - act=None, - groups=None, - stride=[1, 1], - padding=None, - bias_attr=None, - bias_initializer=None, - param_attr=None, - param_initializer=None, - main_program=None, - startup_program=None): - """ - This function creates the op for a 2-dimensional Convolution. - This is performed using the parameters of filters(size, dimensionality etc) - , stride and other configurations for a Convolution operation. - This funciton can also append an activation on top of the - conv-2d output, if mentioned in the input parameters. 
- """ - - def _get_default_bias_initializer(): - return ConstantInitializer() - - def _get_default_param_initializer(filter_size, num_channels): - std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 - return NormalInitializer(0.0, std, 0) - - helper = LayerHelper('conv2d', **locals()) - dtype = helper.input_dtype() - - num_channels = input.shape[1] - if groups is None: - num_filter_channels = num_channels - else: - if num_channels % groups != 0: - raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = num_channels / groups - - if isinstance(filter_size, int): - filter_size = [filter_size, filter_size] - if isinstance(stride, int): - stride = [stride, stride] - if isinstance(padding, int): - padding = [padding, padding] - - input_shape = input.shape - filter_shape = [num_filters, num_filter_channels] + filter_size - - if param_initializer is None: - param_initializer = _get_default_param_initializer(filter_size, - num_channels) - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - - filter = helper.create_parameter( - attr=helper.param_attr, - shape=filter_shape, - dtype=dtype, - initializer=param_initializer) - pre_bias = helper.create_tmp_variable(dtype) - - helper.append_op( - type='conv2d', - inputs={ - 'Input': input, - 'Filter': filter, - }, - outputs={"Output": pre_bias}, - attrs={'strides': stride, - 'paddings': padding, - 'groups': groups}) - - pre_act = helper.append_bias_op( - pre_bias, bias_initializer, dim_start=1, dim_end=2) - - return helper.append_activation(pre_act) - - -def sequence_pool(input, pool_type, **kwargs): - """ - This function add the operator for sequence pooling. - This is applied on top of the input using pool_type mentioned - in the parameters. - """ - helper = LayerHelper('sequence_pool', input=input, **kwargs) - dtype = helper.input_dtype() - pool_out = helper.create_tmp_variable(dtype) - max_index = helper.create_tmp_variable(dtype) - - helper.append_op( - type="sequence_pool", - inputs={"X": input}, - outputs={"Out": pool_out, - "MaxIndex": max_index}, - attrs={"pooltype": pool_type.upper()}) - - return pool_out - - -def pool2d(input, - pool_size, - pool_type, - pool_stride=[1, 1], - pool_padding=[0, 0], - global_pooling=False, - main_program=None, - startup_program=None): - """ - This function adds the operator for pooling in 2 dimensions, using the - pooling configurations mentioned in input parameters. - """ - if pool_type not in ["max", "avg"]: - raise ValueError( - "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) - if isinstance(pool_size, int): - pool_size = [pool_size, pool_size] - if isinstance(pool_stride, int): - pool_stride = [pool_stride, pool_stride] - if isinstance(pool_padding, int): - pool_padding = [pool_padding, pool_padding] - - helper = LayerHelper('pool2d', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_tmp_variable(dtype) - - helper.append_op( - type="pool2d", - inputs={"X": input}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "global_pooling": global_pooling, - "strides": pool_stride, - "paddings": pool_padding - }) - - return pool_out - - -def batch_norm(input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - main_program=None, - startup_program=None): - """ - This function helps create an operator to implement - the BatchNorm layer using the configurations from the input parameters. 
- """ - helper = LayerHelper('batch_norm', **locals()) - dtype = helper.input_dtype() - - input_shape = input.shape - if data_layout == 'NCHW': - channel_num = input_shape[1] - else: - if data_layout == 'NHWC': - channel_num = input_shape[-1] - else: - raise ValueError("unsupported data layout:" + data_layout) - - param_shape = [channel_num] - - # create parameter - scale = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - initializer=ConstantInitializer(1.0)) - bias = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - initializer=ConstantInitializer(0.0)) - - mean = helper.create_global_variable( - dtype=input.data_type, shape=param_shape, persistable=True) - helper.set_variable_initializer( - var=mean, initializer=ConstantInitializer(0.0)) - - variance = helper.create_global_variable( - dtype=input.data_type, shape=param_shape, persistable=True) - helper.set_variable_initializer( - var=variance, initializer=ConstantInitializer(1.0)) - - # create output - # mean and mean_out share the same memory - mean_out = mean - # variance and variance out share the same memory - variance_out = variance - saved_mean = helper.create_tmp_variable(dtype) - saved_variance = helper.create_tmp_variable(dtype) - - batch_norm_out = helper.create_tmp_variable(dtype) - - helper.append_op( - type="batch_norm", - inputs={ - "X": input, - "Scale": scale, - "Bias": bias, - "Mean": mean, - "Variance": variance - }, - outputs={ - "Y": batch_norm_out, - "MeanOut": mean_out, - "VarianceOut": variance_out, - "SavedMean": saved_mean, - "SavedVariance": saved_variance - }, - attrs={"momentum": momentum, - "epsilon": epsilon, - "is_test": is_test}) - - return helper.append_activation(batch_norm_out) - - -def beam_search_decode(ids, scores, main_program=None, startup_program=None): - helper = LayerHelper('beam_search_decode', **locals()) - sentence_ids = helper.create_tmp_variable(dtype=ids.data_type) - sentence_scores = helper.create_tmp_variable(dtype=ids.data_type) - - helper.append_op( - type="beam_search_decode", - inputs={"Ids": ids, - "Scores": scores}, - outputs={ - "SentenceIds": sentence_ids, - "SentenceScores": sentence_scores - }) - - return sentence_ids, sentence_scores - - -class BlockGuard(object): - """ - BlockGuard class. - - BlockGuard class is used to create a sub-block in a program by - using the Python `with` keyword. - """ - - def __init__(self, main_program): - if not isinstance(main_program, Program): - raise TypeError("BlockGuard takes a program") - self.main_program = main_program - - def __enter__(self): - self.main_program.create_block() - - def __exit__(self, exc_type, exc_val, exc_tb): - self.main_program.rollback() - if exc_type is not None: - return False # re-raise exception - return True - - -class StaticRNNGuard(BlockGuard): - """ - StaticRNNGuard class. - - StaticRNNGuard class is used to create a StaticRNN block in a program. 
- """ - - def __init__(self, rnn): - if not isinstance(rnn, StaticRNN): - raise TypeError("StaticRNNGuard takes a StaticRNN") - super(StaticRNNGuard, self).__init__(rnn.helper.main_program) - self.rnn = rnn - - def __enter__(self): - self.rnn.status = StaticRNN.IN_RNN_BLOCK - return super(StaticRNNGuard, self).__enter__() - - def __exit__(self, exc_type, exc_val, exc_tb): - if exc_type is not None: - return False - self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_rnn_op() - return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb) - - -class StaticRNNMemoryLink(object): - """ - StaticRNNMemoryLink class. - - Args: - init: the initial variable for Memory - init: Variable - pre_mem: the memory variable in previous time step - pre_mem: Variable - mem: the memory variable in current time step - mem: Variable - - StaticRNNMemoryLink class is used to create a link between two - memory cells of a StaticRNN. - """ - - def __init__(self, init, pre_mem, mem=None): - self.init = init - self.pre_mem = pre_mem - self.mem = mem - - -class StaticRNN(object): - """ - StaticRNN class. - - StaticRNN class is used to create a StaticRNN. The RNN will have its - own parameters like inputs, outputs, memories, status and length. - """ - BEFORE_RNN_BLOCK = 0 - IN_RNN_BLOCK = 1 - AFTER_RNN_BLOCK = 2 - - def __init__(self, name=None, main_program=None): - self.helper = LayerHelper( - "static_rnn", name=name, main_program=main_program) - self.memories = {} # memory map, from pre_mem.name --> MemoryLink - self.inputs = [] # input variable list in current block - self.outputs = [] # output variable list in parent block - self.status = StaticRNN.BEFORE_RNN_BLOCK # status flag. - # sequence length, since it is a static RNN, sequence length are fixed. - self.seq_len = None - - def step(self): - return StaticRNNGuard(self) - - def _assert_in_rnn_block_(self, method): - if self.status != StaticRNN.IN_RNN_BLOCK: - raise ValueError("You must invoke {0} in rnn block".format(method)) - - def memory(self, - init=None, - shape=None, - batch_ref=None, - init_value=0.0, - init_batch_dim_idx=0, - ref_batch_dim_idx=1): - """ - Args: - init: boot memory, if not set, a shape, batch_ref must be provided - shape: shape of the boot memory - batch_ref: batch size reference variable - init_value: the init value of boot memory - init_batch_dim_idx: the index of batch size in init's dimension - ref_batch_dim_idx: the index of batch size in batch_ref's dimension - """ - self._assert_in_rnn_block_('memory') - if init is None: - if shape is None or batch_ref is None: - raise ValueError( - "if init is None, memory at least need shape and batch_ref") - parent_block = self.parent_block() - var_name = unique_name("@".join([self.helper.name, "memory_boot"])) - boot_var = parent_block.create_var( - name=var_name, - shape=shape, - dtype=batch_ref.data_type, - persistable=False) - - parent_block.append_op( - type="fill_constant_batch_size_like", - inputs={'Input': [batch_ref]}, - outputs={'Out': [boot_var]}, - attrs={ - 'value': init_value, - 'shape': boot_var.shape, - 'data_type': boot_var.data_type, - 'input_dim_idx': ref_batch_dim_idx, - 'output_dim_idx': init_batch_dim_idx - }) - - return self.memory(init=boot_var) - else: - pre_mem = self.helper.create_variable( - name=unique_name("@".join([self.helper.name, "mem"])), - dtype=init.data_type, - shape=init.shape) - self.memories[pre_mem.name] = StaticRNNMemoryLink( - init=init, pre_mem=pre_mem) - return pre_mem - - def step_input(self, x): - 
self._assert_in_rnn_block_('step_input') - if not isinstance(x, Variable): - raise TypeError("step input takes a Variable") - if self.seq_len is None: - self.seq_len = x.shape[0] - elif self.seq_len != x.shape[0]: - raise ValueError("Static RNN only take fix seq_len input") - - ipt = self.helper.create_variable( - name=x.name, - dtype=x.data_type, - shape=list(x.shape[1:]), - type=x.type) - self.inputs.append(ipt) - return ipt - - def step_output(self, o): - self._assert_in_rnn_block_('step_output') - if not isinstance(o, Variable): - raise TypeError("step output takes a Variable") - - tmp_o = self.helper.create_tmp_variable(dtype=o.data_type) - self.helper.append_op( - type='rnn_memory_helper', - inputs={'X': [o]}, - outputs={'Out': tmp_o}, - attrs={'data_type': o.data_type}) - - out_var = self.parent_block().create_var( - name=tmp_o.name, - shape=[self.seq_len] + list(tmp_o.shape), - dtype=tmp_o.data_type) - - self.outputs.append(out_var) - - def output(self, *outputs): - for each in outputs: - self.step_output(each) - - def update_memory(self, mem, var): - if not isinstance(mem, Variable) or not isinstance(var, Variable): - raise TypeError("update memory should take variables") - self.memories[mem.name].mem = var - - def parent_block(self): - prog = self.helper.main_program - parent_idx = prog.current_block().parent_idx - assert parent_idx >= 0 - parent_block = prog.block(parent_idx) - return parent_block - - def __call__(self, *args, **kwargs): - if self.status != StaticRNN.AFTER_RNN_BLOCK: - raise ValueError("RNN output can only be retrieved after rnn block") - if len(self.outputs) == 0: - raise ValueError("RNN has no output") - elif len(self.outputs) == 1: - return self.outputs[0] - else: - return self.outputs - - def complete_rnn_op(self): - main_program = self.helper.main_program - rnn_block = main_program.current_block() - parent_block = self.parent_block() - - local_inputs = set() - - for op in rnn_block.ops: - assert isinstance(op, Operator) - for oname in op.output_names: - for out_var_name in op.output(oname): - local_inputs.add(out_var_name) - - for var in self.inputs: - local_inputs.add(var.name) - for m in self.memories: - local_inputs.add(m) - - params = list() - for op in rnn_block.ops: - assert isinstance(op, Operator) - for iname in op.input_names: - for in_var_name in op.input(iname): - if in_var_name not in local_inputs: - params.append(in_var_name) - - parameters = [parent_block.var(name) for name in params] - - step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - - inlinks = [parent_block.var(i.name) for i in self.inputs] - outlinks = self.outputs - - boot_memories = [] - pre_memories = [] - memories = [] - for _, mem in self.memories.iteritems(): - boot_memories.append(mem.init) - pre_memories.append(mem.pre_mem.name) - mem_var = rnn_block.var(mem.mem.name) - assert isinstance(mem_var, Variable) - new_mem = self.helper.create_tmp_variable(dtype=mem_var.data_type) - - rnn_block.append_op( - type='rnn_memory_helper', - inputs={'X': [mem_var]}, - outputs={'Out': [new_mem]}, - attrs={'data_type': mem_var.data_type}) - - memories.append(new_mem.name) - - parent_block.append_op( - type='recurrent', - inputs={ - 'inputs': inlinks, - 'initial_states': boot_memories, - 'parameters': parameters - }, - outputs={'outputs': outlinks, - 'step_scopes': [step_scope]}, - attrs={ - 'ex_states': pre_memories, - 'states': memories, - 'step_block': rnn_block - }) - - -class WhileGuard(BlockGuard): - def __init__(self, while_op): - if not isinstance(while_op, 
While): - raise TypeError("WhileGuard takes a while op") - super(WhileGuard, self).__init__(while_op.helper.main_program) - self.while_op = while_op - - def __enter__(self): - self.while_op.status = While.IN_WHILE_BLOCK - return super(WhileGuard, self).__enter__() - - def __exit__(self, exc_type, exc_val, exc_tb): - if exc_type is not None: - return False - self.while_op.status = While.AFTER_WHILE_BLOCK - self.while_op.complete() - return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) - - -class While(object): - BEFORE_WHILE_BLOCK = 0 - IN_WHILE_BLOCK = 1 - AFTER_WHILE_BLOCK = 2 - - def __init__(self, cond, name=None, main_program=None): - self.helper = LayerHelper("while", name=name, main_program=main_program) - self.status = While.BEFORE_WHILE_BLOCK - if not isinstance(cond, Variable): - raise TypeError("condition should be a variable") - assert isinstance(cond, Variable) - if cond.data_type != core.DataType.BOOL: - raise TypeError("condition should be a bool variable") - if reduce(lambda a, b: a * b, cond.shape, 1) != 1: - raise TypeError("condition should be a bool scalar") - self.cond_var = cond - - def block(self): - return WhileGuard(self) - - def complete(self): - main_program = self.helper.main_program - while_block = main_program.current_block() - parent_block = main_program.block(main_program.current_block() - .parent_idx) - - inner_outputs = {self.cond_var.name} - x_name_list = set() - for op in while_block.ops: - for iname in op.input_names: - for in_var_name in op.input(iname): - if in_var_name not in inner_outputs: - x_name_list.add(in_var_name) - - for oname in op.output_names: - for out_var_name in op.output(oname): - inner_outputs.add(out_var_name) - - out_vars = [] - for inner_out_name in inner_outputs: - if inner_out_name in parent_block.vars: - out_vars.append(parent_block.var(inner_out_name)) - - step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - - parent_block.append_op( - type='while', - inputs={ - 'X': [parent_block.var(x_name) for x_name in x_name_list], - 'Condition': [self.cond_var] - }, - outputs={'Out': out_vars, - 'StepScopes': [step_scope]}, - attrs={'step_block': while_block}) - - -def lstm(x, - c_pre_init, - hidden_dim, - forget_bias=None, - main_program=None, - startup_program=None): - """ - This function helps create an operator for the LSTM (Long Short Term - Memory) cell that can be used inside an RNN. - """ - helper = LayerHelper('lstm_unit', **locals()) - rnn = StaticRNN() - with rnn.step(): - c_pre = rnn.memory(init=c_pre_init) - x_t = rnn.step_input(x) - - before_fc = concat( - input=[x_t, c_pre], - axis=1, - main_program=main_program, - startup_program=startup_program) - after_fc = fc(input=before_fc, - size=hidden_dim * 4, - main_program=main_program, - startup_program=startup_program) - - data_type = x.data_type - c = helper.create_tmp_variable(data_type) - h = helper.create_tmp_variable(data_type) - - helper.append_op( - type='lstm_unit', - inputs={"X": after_fc, - "C_prev": c_pre}, - outputs={"C": c, - "H": h}, - attrs={"forget_bias": forget_bias}) - - rnn.update_memory(c_pre, c) - rnn.output(h) - - return rnn() - - -def lod_rank_table(x, level=0, main_program=None): - """ - This function creates an operator for creating a LOD_RANK_TABLE - using the input x. 
- """ - helper = LayerHelper("lod_rank_table", **locals()) - table = helper.create_variable( - type=core.VarDesc.VarType.LOD_RANK_TABLE, - name=unique_name("lod_rank_table")) - helper.append_op( - type='lod_rank_table', - inputs={'X': x}, - outputs={'Out': table}, - attrs={'level': level}) - return table - - -def lod_tensor_to_array(x, table, main_program=None): - """ - This function creates an operator to convert an LOD_Tensor to - an array. - """ - helper = LayerHelper("lod_tensor_to_array", **locals()) - array = helper.create_variable( - name=unique_name("lod_tensor_to_array"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.data_type) - helper.append_op( - type='lod_tensor_to_array', - inputs={'X': x, - 'RankTable': table}, - outputs={'Out': array}) - return array - - -def array_to_lod_tensor(x, table, main_program=None): - """ - This function creates an operator to convert an array to a - LOD_Tensor. - """ - helper = LayerHelper("array_to_lod_tensor", **locals()) - tmp = helper.create_tmp_variable(dtype=x.data_type) - helper.append_op( - type="array_to_lod_tensor", - inputs={'X': x, - 'RankTable': table}, - outputs={'Out': tmp}) - return tmp - - -def fill_constant(shape, dtype, value, main_program=None, startup_program=None): - """ - This function creates a tensor , with shape as mentioned in the input and - specified data_type and fills this up with a constant value that - comes in the input. It also sets the stop_gradient to be True. - """ - helper = LayerHelper("fill_constant", **locals()) - out = helper.create_tmp_variable(dtype=dtype) - helper.append_op( - type='fill_constant', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'shape': shape, - 'data_type': out.data_type, - 'value': float(value) - }) - out.stop_gradient = True - return out - - -def fill_constant_batch_size_like(input, - shape, - dtype, - value, - input_dim_idx=0, - output_dim_idx=0, - main_program=None, - startup_program=None): - helper = LayerHelper("fill_constant_batch_size_like", **locals()) - out = helper.create_tmp_variable(dtype=dtype) - helper.append_op( - type='fill_constant_batch_size_like', - inputs={'Input': input}, - outputs={'Out': [out]}, - attrs={ - 'shape': shape, - 'data_type': out.data_type, - 'value': float(value), - 'input_dim_idx': input_dim_idx, - 'output_dim_idx': output_dim_idx - }) - out.stop_gradient = True - return out - - -def ones(shape, dtype, main_program=None): - """ - This function performs the same function as fill_constant() declared above - with the constant value being 1.0. - """ - return fill_constant(value=1.0, **locals()) - - -def zeros(shape, dtype, main_program=None): - """ - This function performs the same function as fill_constant() declared above - with the constant value being 0.0. - """ - return fill_constant(value=0.0, **locals()) - - -def increment(x, value=1.0, in_place=True, main_program=None): - """ - This function creates an operator to increment each value in the input - `x` by an amount: `value` as mentioned in the input parameter. This - operation is performed in-place by default. - """ - helper = LayerHelper("increment", **locals()) - if not in_place: - out = helper.create_tmp_variable(dtype=x.data_type) - else: - out = x - helper.append_op( - type='increment', - inputs={'X': [x]}, - outputs={'Out': [out]}, - attrs={'step': value}) - return out - - -def array_write(x, i, array=None, main_program=None): - """ - This function creates an operator to write the data out as a - LOD_TENSOR_ARRAY. 
- """ - helper = LayerHelper('array_write', **locals()) - if array is None: - array = helper.create_variable( - name="{0}.out".format(helper.name), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.data_type) - helper.append_op( - type='write_to_array', - inputs={'X': [x], - 'I': [i]}, - outputs={'Out': [array]}) - return array - - -def create_array(dtype, main_program=None): - helper = LayerHelper("array", **locals()) - return helper.create_variable( - name="{0}.out".format(helper.name), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=dtype) - - -def less_than(x, y, cond=None, main_program=None, **ignored): - helper = LayerHelper("less_than", **locals()) - if cond is None: - cond = helper.create_tmp_variable(dtype='bool') - cond.stop_gradient = True - - helper.append_op( - type='less_than', inputs={'X': [x], - 'Y': [y]}, outputs={'Out': [cond]}) - return cond - - -def array_read(array, i, main_program=None): - """ - This function creates an operator to read the data in as a - LOD_TENSOR_ARRAY. - """ - helper = LayerHelper('array_read', **locals()) - if not isinstance( - array, - Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: - raise TypeError("array should be tensor array vairable") - out = helper.create_tmp_variable(dtype=array.data_type) - helper.append_op( - type='read_from_array', - inputs={'X': [array], - 'I': [i]}, - outputs={'Out': [out]}) - return out - - -def shrink_memory(x, i, table, main_program=None): - """ - This function creates an operator to shrink_rnn_memory using the RankTable - as mentioned in the input parameter. - """ - helper = LayerHelper('shrink_memory', **locals()) - out = helper.create_tmp_variable(dtype=x.data_type) - helper.append_op( - type='shrink_rnn_memory', - inputs={'X': [x], - 'I': [i], - 'RankTable': [table]}, - outputs={'Out': [out]}, - attrs={}) - return out - - -def array_length(array, main_program=None): - """ - This function creates an operator to find the length of the - LOD_TENSOR_ARRAY. 
- """ - helper = LayerHelper('array_length', **locals()) - tmp = helper.create_tmp_variable(dtype='int64') - tmp.stop_gradient = True - helper.append_op( - type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]}) - return tmp - - -class ConditionalBlockGuard(BlockGuard): - def __init__(self, block): - if not isinstance(block, ConditionalBlock): - raise TypeError("block should be conditional block") - super(ConditionalBlockGuard, self).__init__(block.helper.main_program) - self.block = block - - def __enter__(self): - return super(ConditionalBlockGuard, self).__enter__() - - def __exit__(self, exc_type, exc_val, exc_tb): - self.block.complete() - return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val, - exc_tb) - - -class ConditionalBlock(object): - def __init__(self, - inputs, - name=None, - main_program=None, - startup_program=None): - for each_input in inputs: - if not isinstance(each_input, Variable): - raise TypeError("Each input should be variable") - self.inputs = inputs - self.helper = LayerHelper( - 'conditional_block', - name=name, - main_program=main_program, - startup_program=startup_program) - - def block(self): - return ConditionalBlockGuard(self) - - def complete(self): - inside_block = self.helper.main_program.current_block() - parent_block = self.helper.main_program.block(inside_block.parent_idx) - - intermediate = set() - params = set() - - for each_op in inside_block.ops: - assert isinstance(each_op, Operator) - for iname in each_op.input_names: - for in_var_name in each_op.input(iname): - if in_var_name not in intermediate: - params.add(in_var_name) - - for oname in each_op.output_names: - for out_var_name in each_op.output(oname): - intermediate.add(out_var_name) - input_set = set([ipt.name for ipt in self.inputs]) - - param_list = [ - parent_block.var(each_name) for each_name in params - if each_name not in input_set - ] - - out_list = [ - parent_block.var(var_name) for var_name in parent_block.vars - if var_name not in intermediate - ] - - step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - parent_block.append_op( - type='conditional_block', - inputs={ - 'X': self.inputs, - 'Params': param_list, - }, - outputs={'Out': out_list, - 'Scope': [step_scope]}, - attrs={'block': inside_block}) - - -class IfElseBlockGuard(object): - def __init__(self, is_true, ifelse): - if not isinstance(ifelse, IfElse): - raise TypeError("ifelse must be an instance of IfElse class") - - if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS: - raise ValueError("You cannot invoke IfElse.block() inside a block") - - self.is_true = is_true - self.ie = ifelse - if is_true: - self.cond_block = ifelse.conditional_true_block - else: - self.cond_block = ifelse.conditional_false_block - - if not isinstance(self.cond_block, ConditionalBlock): - raise TypeError("Unexpected situation") - - self.cond_block = self.cond_block.block() - - def __enter__(self): - self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS - self.cond_block.__enter__() - - def __exit__(self, exc_type, exc_val, exc_tb): - if not self.cond_block.__exit__(exc_type, exc_val, exc_tb): - # re-raise inside exception - return False - if len(self.ie.output_table[1 if self.is_true else 0]) == 0: - raise ValueError("Must set output inside block") - self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS - - -class IfElse(object): - OUT_IF_ELSE_BLOCKS = 0 - IN_IF_ELSE_TRUE_BLOCKS = 1 - IN_IF_ELSE_FALSE_BLOCKS = 2 - - def __init__(self, cond, name=None, 
main_program=None, - startup_program=None): - if not isinstance(cond, Variable): - raise TypeError("cond must be a Variable") - self.helper = LayerHelper( - 'ifelse', - name=name, - main_program=main_program, - startup_program=startup_program) - self.cond = cond - self.input_table = {} - self.status = IfElse.OUT_IF_ELSE_BLOCKS - self.conditional_true_block = ConditionalBlock(inputs=[self.cond]) - self.conditional_false_block = ConditionalBlock(inputs=[self.cond]) - self.output_table = ([], []) # (true_outs, false_outs) - - def input(self, x): - if self.status == IfElse.OUT_IF_ELSE_BLOCKS: - raise ValueError("input must in true/false blocks") - if id(x) not in self.input_table: - parent_block = self.parent_block() - out_true = parent_block.create_var( - name=unique_name('ifelse_input' + self.helper.name), - dtype=x.data_type) - - out_false = parent_block.create_var( - name=unique_name('ifelse_input' + self.helper.name), - dtype=x.data_type) - parent_block.append_op( - type='split_lod_tensor', - inputs={ - 'X': x, - 'Mask': self.cond, - }, - outputs={'OutTrue': out_true, - 'OutFalse': out_false}, - attrs={'level': 0}) - self.input_table[id(x)] = (out_true, out_false) - else: - out_true, out_false = self.input_table[id(x)] - - if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS: - return out_true - else: - return out_false - - def parent_block(self): - current_block = self.helper.main_program.current_block() - return self.helper.main_program.block(current_block.parent_idx) - - def true_block(self): - return IfElseBlockGuard(True, self) - - def false_block(self): - return IfElseBlockGuard(False, self) - - def output(self, *outs): - if self.status == self.OUT_IF_ELSE_BLOCKS: - raise ValueError("output can only be invoked in the sub-block") - - out_table = self.output_table[1 if self.status == - self.IN_IF_ELSE_TRUE_BLOCKS else 0] - parent_block = self.parent_block() - for each_out in outs: - if not isinstance(each_out, Variable): - raise TypeError("Each output should be a variable") - # create outside tensor - outside_out = parent_block.create_var( - name=unique_name("_".join([self.helper.name, 'output'])), - dtype=each_out.data_type) - out_table.append(outside_out) - - # assign local var to outside - assign( - input=each_out, - output=outside_out, - main_program=self.helper.main_program, - startup_program=self.helper.startup_program) - - def __call__(self): - if self.status != self.OUT_IF_ELSE_BLOCKS: - raise ValueError("IfElse::__call__ must be out of sub-block") - false_len, true_len = map(len, self.output_table) - if false_len == 0 and true_len == 0: - raise ValueError("Must invoke true_block/false_block before " - "__call__") - elif false_len != true_len and false_len != 0 and true_len != 0: - raise ValueError("The output side must be same") - elif false_len == 0 or true_len == 0: - return self.output_table[0 if false_len != 0 else 1] - - # else none of false_len/true_len is zero - # merge together - rlist = [] - for false_var, true_var in zip(*self.output_table): - rlist.append( - merge_lod_tensor( - in_true=true_var, - in_false=false_var, - mask=self.cond, - x=self.cond, - level=0, - main_program=self.helper.main_program, - startup_program=self.helper.startup_program)) - return rlist diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c190af3329409e5b87b182a11a84dc87dfc46d6e --- /dev/null +++ b/python/paddle/v2/fluid/layers/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import ops +from ops import * +import nn +from nn import * +import io +from io import * +import tensor +from tensor import * +import control_flow +from control_flow import * +import device +from device import * + +__all__ = [] +__all__ += nn.__all__ +__all__ += io.__all__ +__all__ += tensor.__all__ +__all__ += control_flow.__all__ +__all__ += ops.__all__ +__all__ += device.__all__ diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..e72b22c83f65789b9e5d81611bec602d8d78be6b --- /dev/null +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -0,0 +1,1472 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from ..layer_helper import LayerHelper, unique_name +from ..framework import Program, Variable, Operator +from .. import core +from tensor import assign, fill_constant +import contextlib +from ..registry import autodoc + +__all__ = [ + 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', + 'BlockGuardWithCompletion', 'StaticRNNMemoryLink', 'WhileGuard', 'While', + 'lod_rank_table', 'max_sequence_len', 'topk', 'lod_tensor_to_array', + 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', + 'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse', + 'DynamicRNN', 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank', + 'ParallelDo', 'Print' +] + + +def split_lod_tensor(input, mask, level=0): + """ + **split_lod_tensor** + + This function takes in an input that contains the complete lod information, + and takes in a mask which is used to mask certain parts of the input. + The output is the true branch and the false branch with the mask applied to + the input at a certain level in the tensor. + + Args: + input(tuple|list|None): The input tensor that contains complete + lod information needed to construct the output. + mask(list): A bool column vector which masks the input. + level(int): The specific lod level to rank. + + Returns: + Variable: The true branch of tensor as per the mask applied to input. + Variable: The false branch of tensor as per the mask applied to input. + + Examples: + .. 
code-block:: python
+
+          x = layers.data(name='x', shape=[1])
+          x.persistable = True
+
+          y = layers.data(name='y', shape=[1])
+          y.persistable = True
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=0)
+    """
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.dtype)
+    out_false = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false
+
+
+def merge_lod_tensor(in_true, in_false, x, mask, level=0):
+    """
+    **merge_lod_tensor**
+
+    This function takes in an input :math:`x`, the True branch, the False
+    branch and a binary :math:`mask`. Using this information, this function
+    merges the True and False branches of the tensor into a single output
+    at a certain lod level indicated by :math:`level`.
+
+    Args:
+        in_true(tuple|list|None): The True branch to be merged.
+        in_false(tuple|list|None): The False branch to be merged.
+        x(tuple|list|None): The input tensor that contains complete
+                            lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The merged output tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+          y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
+
+          level = 0
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+          out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+    """
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=in_true.dtype)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out
+
+
+def Print(input,
+          first_n=-1,
+          message=None,
+          summarize=-1,
+          print_tensor_name=True,
+          print_tensor_type=True,
+          print_tensor_shape=True,
+          print_tensor_lod=True,
+          print_phase='both'):
+    '''
+    **Print operator**
+
+    This creates a print op that will print when a tensor is accessed.
+
+    Wraps the tensor passed in so that whenever the tensor is accessed,
+    the message `message` is printed, along with the current value of
+    the tensor.
+
+    Args:
+        input (Variable): A Tensor to print.
+        summarize (int): Print this number of elements in the tensor; prints
+            all if it is negative.
+        message (str): A string message to print as a prefix.
+        first_n (int): Only log `first_n` number of times.
+        print_tensor_name (bool): Print the tensor name.
+        print_tensor_type (bool): Print the tensor type.
+        print_tensor_shape (bool): Print the tensor shape.
+        print_tensor_lod (bool): Print the tensor lod.
+        print_phase (str): Which phase to display, including 'forward',
+            'backward' and 'both'. If set to 'backward' or 'both', will
+            print the gradients of the input tensor.
+
+    Returns:
+        Variable: Output tensor, with the same data as the input tensor.
+
+    Examples:
+        .. code-block:: python
+
+           value = some_layer(...)
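+           # wrap value so its contents are logged whenever it is evaluated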
+ Print(value, summarize=10, + message="The content of some_layer: ") + ''' + helper = LayerHelper('print', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='print', + inputs={'In': input}, + attrs={ + 'first_n': first_n, + 'summarize': summarize, + 'message': message or "", + 'print_tensor_name': print_tensor_name, + 'print_tensor_type': print_tensor_type, + 'print_tensor_shape': print_tensor_shape, + 'print_tensor_lod': print_tensor_lod, + 'print_phase': print_phase.upper() + }, + outputs={'Out': out}) + return out + + +class BlockGuard(object): + """ + BlockGuard class. + + BlockGuard class is used to create a sub-block in a program by + using the Python `with` keyword. + """ + + def __init__(self, main_program): + if not isinstance(main_program, Program): + raise TypeError("BlockGuard takes a program") + self.main_program = main_program + + def __enter__(self): + self.main_program.create_block() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.rollback() + if exc_type is not None: + return False # re-raise exception + return True + + +class ParallelDo(object): + """ + ParallelDo class. + + ParallelDo class is used to create a ParallelDo. + """ + + def __init__(self, places, name=None): + self.helper = LayerHelper("parallel_do", name=name) + self.inputs = [] + self.places = places + self.outputs = [] + self.status = StaticRNN.BEFORE_RNN_BLOCK + + def do(self): + return BlockGuardWithCompletion(self) + + def parent_block(self): + prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + return parent_block + + def __call__(self, *args, **kwargs): + if self.status != StaticRNN.AFTER_RNN_BLOCK: + raise ValueError("RNN output can only be retrieved after rnn block") + if len(self.outputs) == 0: + raise ValueError("RNN has no output") + elif len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def read_input(self, var): + self.inputs.append(var) + return var + + def write_output(self, var): + self.outputs.append(var) + + def get_parameters(self): + main_program = self.helper.main_program + current_block = main_program.current_block() + parent_block = self.parent_block() + + local_inputs = set() + + for op in current_block.ops: + for oname in op.output_names: + for out_var_name in op.output(oname): + local_inputs.add(out_var_name) + + for var in self.inputs: + local_inputs.add(var.name) + + params = list() + for op in current_block.ops: + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in local_inputs: + params.append(in_var_name) + + return [parent_block.var(name) for name in params] + + def complete_op(self): + main_program = self.helper.main_program + current_block = main_program.current_block() + parent_block = self.parent_block() + + step_scope = parent_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + self.outputs = [ + parent_block.create_var( + name=o.name, + shape=o.shape, + dtype=o.dtype, + lod_level=o.lod_level, + persistable=o.persistable, + stop_gradient=o.stop_gradient) for o in self.outputs + ] + + inputs = [parent_block.var(i.name) for i in self.inputs] + outputs = [parent_block.var(o.name) for o in self.outputs] + + parent_block.append_op( + type='parallel_do', + inputs={ + 'inputs': inputs, + 'parameters': self.get_parameters(), + 'places': self.places + }, + outputs={'outputs': outputs, + 'parallel_scopes': [step_scope]}, + 
attrs={'sub_block': current_block}) + + +class BlockGuardWithCompletion(BlockGuard): + """ + BlockGuardWithCompletion class. + + BlockGuardWithCompletion class is used to create an op with a block in a program. + """ + + def __init__(self, rnn): + if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)): + raise TypeError( + "BlockGuardWithCompletion takes a StaticRNN or ParallelDo") + super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) + self.rnn = rnn + + def __enter__(self): + self.rnn.status = StaticRNN.IN_RNN_BLOCK + return super(BlockGuardWithCompletion, self).__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + return False + self.rnn.status = StaticRNN.AFTER_RNN_BLOCK + self.rnn.complete_op() + return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, + exc_tb) + + +class StaticRNNMemoryLink(object): + """ + StaticRNNMemoryLink class. + + Args: + init: the initial variable for Memory + init: Variable + pre_mem: the memory variable in previous time step + pre_mem: Variable + mem: the memory variable in current time step + mem: Variable + + StaticRNNMemoryLink class is used to create a link between two + memory cells of a StaticRNN. + """ + + def __init__(self, init, pre_mem, mem=None): + self.init = init + self.pre_mem = pre_mem + self.mem = mem + + +class StaticRNN(object): + """ + StaticRNN class. + + StaticRNN class is used to create a StaticRNN. The RNN will have its + own parameters like inputs, outputs, memories, status and length. + """ + BEFORE_RNN_BLOCK = 0 + IN_RNN_BLOCK = 1 + AFTER_RNN_BLOCK = 2 + + def __init__(self, name=None): + self.helper = LayerHelper("static_rnn", name=name) + self.memories = {} # memory map, from pre_mem.name --> MemoryLink + self.inputs = [] # input variable list in current block + self.outputs = [] # output variable list in parent block + self.status = StaticRNN.BEFORE_RNN_BLOCK # status flag. + # sequence length, since it is a static RNN, sequence length are fixed. 
+ self.seq_len = None + + def step(self): + return BlockGuardWithCompletion(self) + + def _assert_in_rnn_block_(self, method): + if self.status != StaticRNN.IN_RNN_BLOCK: + raise ValueError("You must invoke {0} in rnn block".format(method)) + + def memory(self, + init=None, + shape=None, + batch_ref=None, + init_value=0.0, + init_batch_dim_idx=0, + ref_batch_dim_idx=1): + """ + Args: + init: boot memory, if not set, a shape, batch_ref must be provided + shape: shape of the boot memory + batch_ref: batch size reference variable + init_value: the init value of boot memory + init_batch_dim_idx: the index of batch size in init's dimension + ref_batch_dim_idx: the index of batch size in batch_ref's dimension + """ + self._assert_in_rnn_block_('memory') + if init is None: + if shape is None or batch_ref is None: + raise ValueError( + "if init is None, memory at least need shape and batch_ref") + parent_block = self.parent_block() + var_name = unique_name("@".join([self.helper.name, "memory_boot"])) + boot_var = parent_block.create_var( + name=var_name, + shape=shape, + dtype=batch_ref.dtype, + persistable=False) + + parent_block.append_op( + type="fill_constant_batch_size_like", + inputs={'Input': [batch_ref]}, + outputs={'Out': [boot_var]}, + attrs={ + 'value': init_value, + 'shape': boot_var.shape, + 'dtype': boot_var.dtype, + 'input_dim_idx': ref_batch_dim_idx, + 'output_dim_idx': init_batch_dim_idx + }) + + return self.memory(init=boot_var) + else: + pre_mem = self.helper.create_variable( + name=unique_name("@".join([self.helper.name, "mem"])), + dtype=init.dtype, + shape=init.shape) + self.memories[pre_mem.name] = StaticRNNMemoryLink( + init=init, pre_mem=pre_mem) + return pre_mem + + def step_input(self, x): + self._assert_in_rnn_block_('step_input') + if not isinstance(x, Variable): + raise TypeError("step input takes a Variable") + if self.seq_len is None: + self.seq_len = x.shape[0] + elif self.seq_len != x.shape[0]: + raise ValueError("Static RNN only take fix seq_len input") + + ipt = self.helper.create_variable( + name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type) + self.inputs.append(ipt) + return ipt + + def step_output(self, o): + self._assert_in_rnn_block_('step_output') + if not isinstance(o, Variable): + raise TypeError("step output takes a Variable") + + tmp_o = self.helper.create_tmp_variable(dtype=o.dtype) + self.helper.append_op( + type='rnn_memory_helper', + inputs={'X': [o]}, + outputs={'Out': tmp_o}, + attrs={'dtype': o.dtype}) + + out_var = self.parent_block().create_var( + name=tmp_o.name, + shape=[self.seq_len] + list(tmp_o.shape), + dtype=tmp_o.dtype) + + self.outputs.append(out_var) + + def output(self, *outputs): + for each in outputs: + self.step_output(each) + + def update_memory(self, mem, var): + if not isinstance(mem, Variable) or not isinstance(var, Variable): + raise TypeError("update memory should take variables") + self.memories[mem.name].mem = var + + def parent_block(self): + prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + return parent_block + + def __call__(self, *args, **kwargs): + if self.status != StaticRNN.AFTER_RNN_BLOCK: + raise ValueError("RNN output can only be retrieved after rnn block") + if len(self.outputs) == 0: + raise ValueError("RNN has no output") + elif len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def complete_op(self): + main_program = self.helper.main_program + rnn_block = 
main_program.current_block() + parent_block = self.parent_block() + + local_inputs = set() + + for op in rnn_block.ops: + assert isinstance(op, Operator) + for oname in op.output_names: + for out_var_name in op.output(oname): + local_inputs.add(out_var_name) + + for var in self.inputs: + local_inputs.add(var.name) + for m in self.memories: + local_inputs.add(m) + + params = list() + for op in rnn_block.ops: + assert isinstance(op, Operator) + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in local_inputs: + params.append(in_var_name) + + parameters = [parent_block.var(name) for name in params] + + step_scope = parent_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + inlinks = [parent_block.var(i.name) for i in self.inputs] + outlinks = self.outputs + + boot_memories = [] + pre_memories = [] + memories = [] + for _, mem in self.memories.iteritems(): + boot_memories.append(mem.init) + pre_memories.append(mem.pre_mem.name) + mem_var = rnn_block.var(mem.mem.name) + assert isinstance(mem_var, Variable) + new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype) + + rnn_block.append_op( + type='rnn_memory_helper', + inputs={'X': [mem_var]}, + outputs={'Out': [new_mem]}, + attrs={'dtype': mem_var.dtype}) + + memories.append(new_mem.name) + + parent_block.append_op( + type='recurrent', + inputs={ + 'inputs': inlinks, + 'initial_states': boot_memories, + 'parameters': parameters + }, + outputs={'outputs': outlinks, + 'step_scopes': [step_scope]}, + attrs={ + 'ex_states': pre_memories, + 'states': memories, + 'sub_block': rnn_block + }) + + +class WhileGuard(BlockGuard): + def __init__(self, while_op): + if not isinstance(while_op, While): + raise TypeError("WhileGuard takes a while op") + super(WhileGuard, self).__init__(while_op.helper.main_program) + self.while_op = while_op + + def __enter__(self): + self.while_op.status = While.IN_WHILE_BLOCK + return super(WhileGuard, self).__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + return False + self.while_op.status = While.AFTER_WHILE_BLOCK + self.while_op.complete() + return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) + + +class While(object): + BEFORE_WHILE_BLOCK = 0 + IN_WHILE_BLOCK = 1 + AFTER_WHILE_BLOCK = 2 + + def __init__(self, cond, name=None): + self.helper = LayerHelper("while", name=name) + self.status = While.BEFORE_WHILE_BLOCK + if not isinstance(cond, Variable): + raise TypeError("condition should be a variable") + assert isinstance(cond, Variable) + if cond.dtype != core.DataType.BOOL: + raise TypeError("condition should be a bool variable") + if reduce(lambda a, b: a * b, cond.shape, 1) != 1: + raise TypeError("condition should be a bool scalar") + self.cond_var = cond + + def block(self): + return WhileGuard(self) + + def complete(self): + main_program = self.helper.main_program + while_block = main_program.current_block() + parent_block = main_program.block(main_program.current_block() + .parent_idx) + + inner_outputs = {self.cond_var.name} + x_name_list = set() + for op in while_block.ops: + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in inner_outputs: + x_name_list.add(in_var_name) + + for oname in op.output_names: + for out_var_name in op.output(oname): + inner_outputs.add(out_var_name) + + out_vars = [] + for inner_out_name in inner_outputs: + if inner_out_name in parent_block.vars: + out_vars.append(parent_block.var(inner_out_name)) + + step_scope = 
parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [parent_block.var(x_name) for x_name in x_name_list],
+                'Condition': [self.cond_var]
+            },
+            outputs={'Out': out_vars,
+                     'StepScopes': [step_scope]},
+            attrs={'sub_block': while_block})
+
+
+def lod_rank_table(x, level=0):
+    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    of LoD, this layer creates a LoDRankTable object. A LoDRankTable object
+    contains a list of bi-element tuples. Each tuple consists of an index and
+    a length, both of which are int type. Referring to the specified level of
+    LoD, the index is the sequence index number and the length represents the
+    sequence length. Please note that the list is ranked in descending order by
+    the length. The following is an example:
+
+        .. code-block:: text
+
+            x is a LoDTensor:
+                x.lod = [[0, 2, 3],
+                         [0, 5, 6, 7]]
+                x.data = [a, b, c, d, e, f, g]
+
+            1. set level to 0:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=0)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 2), (1, 1)]
+
+            2. set level to 1:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=1)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+
+    Args:
+        x (Variable): Input variable, a LoDTensor based on which to create
+            the lod rank table.
+        level (int): Specify the LoD level, on which to create the lod rank
+            table.
+
+    Returns:
+        Variable: The created LoDRankTable object.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                                  dtype='float32', lod_level=1)
+            out = layers.lod_rank_table(x=x, level=0)
+    """
+    helper = LayerHelper("lod_rank_table", **locals())
+    table = helper.create_variable(
+        type=core.VarDesc.VarType.LOD_RANK_TABLE,
+        name=unique_name("lod_rank_table"))
+    helper.append_op(
+        type='lod_rank_table',
+        inputs={'X': x},
+        outputs={'Out': table},
+        attrs={'level': level})
+    return table
+
+
+def max_sequence_len(rank_table):
+    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    returns the max length of a batch of sequences. In fact, a LoDRankTable
+    object contains a list of (sequence index, sequence length) tuples and
+    the list is already sorted by sequence length in descending order, so the
+    operator just returns the sequence length of the first tuple element.
+
+    Args:
+        rank_table (Variable): Input variable which is a LoDRankTable object.
+
+    Returns:
+        Variable: The max length of sequence.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                                  dtype='float32', lod_level=1)
+            rank_table = layers.lod_rank_table(x=x, level=0)
+            max_seq_len = layers.max_sequence_len(rank_table)
+    """
+    helper = LayerHelper("max_sequence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k):
+    """
+    **topk**
+
+    This function performs the operation that selects the k largest entries
+    in the input vector and outputs their values and indices as vectors.
+    Thus topk_out[j] is the j-th largest entry in input, and its index is
+    topk_indices[j].
+
+    Args:
+        input (Variable|list): The input tensor that has all the data.
+        k (int): The number of top elements that the function will pick.
+
+    Returns:
+        Variable: The variable of type array that contains the k largest
+                  entries from input.
+        Variable: The variable of type array that contains the indices of k
+                  largest entries from input.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10])
+            k = 5
+            values, indices = fluid.layers.topk(x, k)
+    """
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out],
+                 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
+def lod_tensor_to_array(x, table):
+    """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
+
+    Args:
+        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
+        table (ParamAttr|list): The variable that stores the level of lod
+                                which is ordered by sequence length in
+                                descending order.
+
+    Returns:
+        Variable: The variable of type array that has been converted from a
+                  tensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10])
+            table = fluid.layers.lod_rank_table(x, level=0)
+            array = fluid.layers.lod_tensor_to_array(x, table)
+    """
+    helper = LayerHelper("lod_tensor_to_array", **locals())
+    array = helper.create_variable(
+        name=unique_name("lod_tensor_to_array"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.dtype)
+    helper.append_op(
+        type='lod_tensor_to_array',
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': array})
+    return array
+
+
+def array_to_lod_tensor(x, table):
+    """Convert a LoDTensorArray to a LoDTensor.
+
+    Args:
+        x (Variable|list): The lod tensor array to be converted to a tensor.
+        table (ParamAttr|list): The variable that stores the level of lod
+                                which is ordered by sequence length in
+                                descending order.
+
+    Returns:
+        Variable: The variable of type tensor that has been converted
+                  from an array.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10])
+            table = fluid.layers.lod_rank_table(x, level=0)
+            array = fluid.layers.lod_tensor_to_array(x, table)
+            lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
+    """
+    helper = LayerHelper("array_to_lod_tensor", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="array_to_lod_tensor",
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': tmp})
+    return tmp
+
+
+def increment(x, value=1.0, in_place=True):
+    """
+    This function performs an operation that increments each value in the
+    input :math:`x` by an amount: :math:`value` as mentioned in the input
+    parameter. This operation is performed in-place by default.
+
+    Args:
+        x (Variable|list): The tensor that has the input values.
+        value (float): The amount by which the values should be incremented.
+        in_place (bool): If the increment should be performed in-place.
+
+    Returns:
+        Variable: The tensor variable storing the transformation of
+                  element-wise increment of each value in the input.
+
+    Examples:
+        ..
+ + +def increment(x, value=1.0, in_place=True): + """ + This function performs an operation that increments each value in the + input :math:`x` by the amount :math:`value` given as an input + parameter. This operation is performed in-place by default. + + Args: + x (Variable|list): The tensor that has the input values. + value (float): The amount by which the values should be incremented. + in_place (bool): If the increment should be performed in-place. + + Returns: + Variable: The tensor variable storing the transformation of + element-wise increment of each value in the input. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.increment(x=data, value=3.0, in_place=True) + """ + helper = LayerHelper("increment", **locals()) + if not in_place: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = x + helper.append_op( + type='increment', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'step': float(value)}) + return out + + +def array_write(x, i, array=None): + """ + This function writes the given input variable to the output + LOD_TENSOR_ARRAY, at the position indicated by the array index. If the + output LOD_TENSOR_ARRAY is not given (None), a new one will be created and + returned. + + Args: + x (Variable|list): The input tensor from which the data will be read. + i (Variable|list): The index of the output LOD_TENSOR_ARRAY, pointing to + the position to which the input tensor will be + written. + array (Variable|list): The output LOD_TENSOR_ARRAY to which the input + tensor will be written. If this parameter is + None, a new LOD_TENSOR_ARRAY will be created and + returned. + + Returns: + Variable: The output LOD_TENSOR_ARRAY where the input tensor is written. + + Examples: + .. code-block:: python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = layers.array_write(tmp, i=i) + """ + helper = LayerHelper('array_write', **locals()) + if array is None: + array = helper.create_variable( + name="{0}.out".format(helper.name), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + helper.append_op( + type='write_to_array', + inputs={'X': [x], + 'I': [i]}, + outputs={'Out': [array]}) + return array + + +def create_array(dtype): + """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the + LayerHelper. + + Args: + dtype (int|float): The data type of the elements in the array. + + Returns: + Variable: The array variable storing elements of the given data type. + + Examples: + .. code-block:: python + + data = fluid.layers.create_array(dtype='float32') + + """ + helper = LayerHelper("array", **locals()) + return helper.create_variable( + name="{0}.out".format(helper.name), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=dtype) + + +def less_than(x, y, cond=None, **ignored): + """ + **Less than** + + This layer returns the truth value of :math:`x < y` elementwise. + + Args: + x(Variable): First operand of *less_than* + y(Variable): Second operand of *less_than* + cond(Variable|None): Optional output variable to store the result of *less_than* + + Returns: + Variable: The tensor variable storing the output of *less_than*. + + Examples: + .. code-block:: python + + less = fluid.layers.less_than(x=label, y=limit) + """ + helper = LayerHelper("less_than", **locals()) + if cond is None: + cond = helper.create_tmp_variable(dtype='bool') + cond.stop_gradient = True + + helper.append_op( + type='less_than', inputs={'X': [x], + 'Y': [y]}, outputs={'Out': [cond]}) + return cond
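The three primitives above (increment, array_write, less_than) are the building blocks of fluid's while loops. As a rough plain-Python picture of the control flow they express (illustrative only, not the fluid API):

.. code-block:: python

    # Plain-Python picture of the counter pattern that a fluid While block
    # builds out of less_than / array_write / increment.
    step, limit, array = 0, 3, []
    cond = step < limit            # less_than(step, limit)
    while cond:
        array.append(step * 10)    # array_write(value, i=step)
        step += 1                  # increment(step, in_place=True)
        cond = step < limit        # re-evaluate the loop condition
    assert array == [0, 10, 20]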
+ + +def array_read(array, i): + """This function reads the data at the given position from the input + LOD_TENSOR_ARRAY. + + Args: + array (Variable|list): The input tensor array from which the data will + be read. + i (Variable|list): The subscript index in the tensor array, pointing to + the position from which the data will be read. + + Returns: + Variable: The tensor variable storing the data that has been read. + + Examples: + .. code-block:: python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = layers.array_read(tmp, i=i) + """ + helper = LayerHelper('array_read', **locals()) + if not isinstance( + array, + Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + raise TypeError("array should be a tensor array variable") + out = helper.create_tmp_variable(dtype=array.dtype) + helper.append_op( + type='read_from_array', + inputs={'X': [array], + 'I': [i]}, + outputs={'Out': [out]}) + return out + + +def shrink_memory(x, i, table): + """ + This function creates an operator to shrink the rnn memory using the + RankTable given as an input parameter. + """ + helper = LayerHelper('shrink_memory', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='shrink_rnn_memory', + inputs={'X': [x], + 'I': [i], + 'RankTable': [table]}, + outputs={'Out': [out]}, + attrs={}) + return out + + +def array_length(array): + """This function performs the operation to find the length of the input + LOD_TENSOR_ARRAY. + + Args: + array (LOD_TENSOR_ARRAY): The input array that will be used + to compute the length. + + Returns: + Variable: The length of the input LoDTensorArray. + + Examples: + .. code-block:: python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = fluid.layers.array_write(tmp, i=i) + arr_len = fluid.layers.array_length(arr) + """ + helper = LayerHelper('array_length', **locals()) + tmp = helper.create_tmp_variable(dtype='int64') + tmp.stop_gradient = True + helper.append_op( + type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]}) + return tmp + + +class ConditionalBlockGuard(BlockGuard): + def __init__(self, block): + if not isinstance(block, ConditionalBlock): + raise TypeError("block should be conditional block") + super(ConditionalBlockGuard, self).__init__(block.helper.main_program) + self.block = block + + def __enter__(self): + return super(ConditionalBlockGuard, self).__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.block.complete() + return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val, + exc_tb) + + +class ConditionalBlock(object): + def __init__(self, inputs, name=None): + for each_input in inputs: + if not isinstance(each_input, Variable): + raise TypeError("Each input should be a variable") + self.inputs = inputs + self.helper = LayerHelper('conditional_block', name=name) + + def block(self): + return ConditionalBlockGuard(self) + + def complete(self): + inside_block = self.helper.main_program.current_block() + parent_block = self.helper.main_program.block(inside_block.parent_idx) + + intermediate = set() + params = set() + + for each_op in inside_block.ops: + assert isinstance(each_op, Operator) + for iname in each_op.input_names: + for in_var_name in each_op.input(iname): + if in_var_name not in intermediate: + params.add(in_var_name) + + for oname in each_op.output_names: + for out_var_name in each_op.output(oname): + intermediate.add(out_var_name) + input_set = set([ipt.name for ipt in self.inputs]) + + param_list = [ + parent_block.var(each_name) for each_name in params + if each_name not in input_set + ] + + out_list = [ + parent_block.var(var_name) for var_name in parent_block.vars + if var_name in intermediate + ] + + step_scope = parent_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + parent_block.append_op(
type='conditional_block', + inputs={ + 'X': self.inputs, + 'Params': param_list, + }, + outputs={'Out': out_list, + 'Scope': [step_scope]}, + attrs={'sub_block': inside_block}) + + +class IfElseBlockGuard(object): + def __init__(self, is_true, ifelse): + if not isinstance(ifelse, IfElse): + raise TypeError("ifelse must be an instance of IfElse class") + + if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS: + raise ValueError("You cannot invoke IfElse.block() inside a block") + + self.is_true = is_true + self.ie = ifelse + if is_true: + self.cond_block = ifelse.conditional_true_block + else: + self.cond_block = ifelse.conditional_false_block + + if not isinstance(self.cond_block, ConditionalBlock): + raise TypeError("Unexpected situation") + + self.cond_block = self.cond_block.block() + + def __enter__(self): + self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS + self.cond_block.__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + if not self.cond_block.__exit__(exc_type, exc_val, exc_tb): + # re-raise inside exception + return False + if len(self.ie.output_table[1 if self.is_true else 0]) == 0: + raise ValueError("Must set output inside block") + self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS + + +class IfElse(object): + OUT_IF_ELSE_BLOCKS = 0 + IN_IF_ELSE_TRUE_BLOCKS = 1 + IN_IF_ELSE_FALSE_BLOCKS = 2 + + def __init__(self, cond, name=None): + if not isinstance(cond, Variable): + raise TypeError("cond must be a Variable") + self.helper = LayerHelper('ifelse', name=name) + self.cond = cond + self.input_table = {} + self.status = IfElse.OUT_IF_ELSE_BLOCKS + self.conditional_true_block = ConditionalBlock(inputs=[self.cond]) + self.conditional_false_block = ConditionalBlock(inputs=[self.cond]) + self.output_table = ([], []) # (true_outs, false_outs) + + def input(self, x): + if self.status == IfElse.OUT_IF_ELSE_BLOCKS: + raise ValueError("input must in true/false blocks") + if id(x) not in self.input_table: + parent_block = self.parent_block() + out_true = parent_block.create_var( + name=unique_name('ifelse_input' + self.helper.name), + dtype=x.dtype) + + out_false = parent_block.create_var( + name=unique_name('ifelse_input' + self.helper.name), + dtype=x.dtype) + parent_block.append_op( + type='split_lod_tensor', + inputs={ + 'X': x, + 'Mask': self.cond, + }, + outputs={'OutTrue': out_true, + 'OutFalse': out_false}, + attrs={'level': 0}) + self.input_table[id(x)] = (out_true, out_false) + else: + out_true, out_false = self.input_table[id(x)] + + if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS: + return out_true + else: + return out_false + + def parent_block(self): + current_block = self.helper.main_program.current_block() + return self.helper.main_program.block(current_block.parent_idx) + + def true_block(self): + return IfElseBlockGuard(True, self) + + def false_block(self): + return IfElseBlockGuard(False, self) + + def output(self, *outs): + if self.status == self.OUT_IF_ELSE_BLOCKS: + raise ValueError("output can only be invoked in the sub-block") + + out_table = self.output_table[1 if self.status == + self.IN_IF_ELSE_TRUE_BLOCKS else 0] + parent_block = self.parent_block() + for each_out in outs: + if not isinstance(each_out, Variable): + raise TypeError("Each output should be a variable") + # create outside tensor + outside_out = parent_block.create_var( + name=unique_name("_".join([self.helper.name, 'output'])), + dtype=each_out.dtype) + out_table.append(outside_out) + + # assign local var to outside + assign(input=each_out, 
output=outside_out) + + def __call__(self): + if self.status != self.OUT_IF_ELSE_BLOCKS: + raise ValueError("IfElse::__call__ must be out of sub-block") + false_len, true_len = map(len, self.output_table) + if false_len == 0 and true_len == 0: + raise ValueError("Must invoke true_block/false_block before " + "__call__") + elif false_len != true_len and false_len != 0 and true_len != 0: + raise ValueError("The output side must be same") + elif false_len == 0 or true_len == 0: + return self.output_table[0 if false_len != 0 else 1] + + # else none of false_len/true_len is zero + # merge together + rlist = [] + for false_var, true_var in zip(*self.output_table): + rlist.append( + merge_lod_tensor( + in_true=true_var, + in_false=false_var, + mask=self.cond, + x=self.cond, + level=0)) + return rlist + + +class DynamicRNN(object): + BEFORE_RNN = 0 + IN_RNN = 1 + AFTER_RNN = 2 + + def __init__(self, name=None): + self.helper = LayerHelper('dynamic_rnn', name=name) + self.status = DynamicRNN.BEFORE_RNN + self.lod_rank_table = None + self.max_seq_len = None + self.step_idx = None + self.zero_idx = fill_constant( + shape=[1], value=0, dtype='int64', force_cpu=True) + self.mem_dict = dict() + self.output_array = [] + self.outputs = [] + self.cond = self.helper.create_tmp_variable(dtype='bool') + self.cond.stop_gradient = False + self.while_op = While(self.cond) + self.input_array = [] + self.mem_link = [] + + def step_input(self, x): + self._assert_in_rnn_block_("step_input") + if not isinstance(x, Variable): + raise TypeError( + "step_input() can only take a Variable as its input.") + parent_block = self._parent_block_() + if self.lod_rank_table is None: + self.lod_rank_table = parent_block.create_var( + name=unique_name('lod_rank_table'), + type=core.VarDesc.VarType.LOD_RANK_TABLE) + self.lod_rank_table.stop_gradient = True + parent_block.append_op( + type='lod_rank_table', + inputs={"X": x}, + outputs={"Out": self.lod_rank_table}) + self.max_seq_len = parent_block.create_var( + name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64') + self.max_seq_len.stop_gradient = False + parent_block.append_op( + type='max_sequence_len', + inputs={'RankTable': self.lod_rank_table}, + outputs={"Out": self.max_seq_len}) + self.cond.stop_gradient = True + parent_block.append_op( + type='less_than', + inputs={'X': self.step_idx, + 'Y': self.max_seq_len}, + outputs={'Out': self.cond}) + + input_array = parent_block.create_var( + name=unique_name('dynamic_rnn_input_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + self.input_array.append((input_array, x.dtype)) + parent_block.append_op( + type='lod_tensor_to_array', + inputs={'X': x, + 'RankTable': self.lod_rank_table}, + outputs={'Out': input_array}) + return array_read(array=input_array, i=self.step_idx) + + def static_input(self, x): + self._assert_in_rnn_block_("static_input") + if not isinstance(x, Variable): + raise TypeError( + "static_input() can only take a Variable as its input") + if self.lod_rank_table is None: + raise RuntimeError( + "static_input() must be called after step_input().") + parent_block = self._parent_block_() + x_reordered = parent_block.create_var( + name=unique_name("dynamic_rnn_static_input_reordered"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype=x.dtype) + parent_block.append_op( + type='reorder_lod_tensor_by_rank', + inputs={'X': [x], + 'RankTable': [self.lod_rank_table]}, + outputs={'Out': [x_reordered]}) + return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table) + + 
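Conceptually, step_input/static_input plus the rank table implement a batched time loop over variable-length sequences: sequences are sorted by length, and at step t only the sequences still longer than t remain in the batch (this is what shrink_memory does). A plain-Python sketch of that iteration order (illustrative only, not fluid code):

.. code-block:: python

    # Plain-Python sketch of the dynamic-RNN batching trick.
    seqs = [[1, 2, 3, 4, 5], [6, 7, 8], [9]]   # already sorted by length
    max_len = len(seqs[0])                     # max_sequence_len
    outputs = [[] for _ in seqs]
    for t in range(max_len):
        batch = [s for s in seqs if len(s) > t]   # shrunk batch at step t
        for i, s in enumerate(batch):
            outputs[i].append(s[t] * 2)           # the per-step computation
    assert outputs == [[2, 4, 6, 8, 10], [12, 14, 16], [18]]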
+ @contextlib.contextmanager + def block(self): + if self.status != DynamicRNN.BEFORE_RNN: + raise ValueError("rnn.block() can only be invoked once") + self.step_idx = fill_constant( + shape=[1], dtype='int64', value=0, force_cpu=True) + self.step_idx.stop_gradient = False + self.status = DynamicRNN.IN_RNN + with self.while_op.block(): + yield + increment(x=self.step_idx, value=1.0, in_place=True) + + for new_mem, mem_array in self.mem_link: + array_write(x=new_mem, i=self.step_idx, array=mem_array) + + less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond) + + self.status = DynamicRNN.AFTER_RNN + for each_array in self.output_array: + self.outputs.append( + array_to_lod_tensor( + x=each_array, table=self.lod_rank_table)) + + def __call__(self, *args, **kwargs): + if self.status != DynamicRNN.AFTER_RNN: + raise ValueError(("Output of the dynamic RNN can only be accessed " + "outside the rnn block.")) + if len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def memory(self, + init=None, + shape=None, + value=0.0, + need_reorder=False, + dtype='float32'): + self._assert_in_rnn_block_('memory') + if init is not None: + if not isinstance(init, Variable): + raise TypeError( + "The input arg `init` of memory() must be a Variable") + parent_block = self._parent_block_() + init_tensor = init + if need_reorder: + if self.lod_rank_table is None: + raise ValueError( + 'If need_reorder is set to True, make sure step_input is ' + 'invoked before ' + 'memory(init=init, need_reorder=True, ...).') + init_reordered = parent_block.create_var( + name=unique_name('dynamic_rnn_mem_init_reordered'), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype=init.dtype) + parent_block.append_op( + type='reorder_lod_tensor_by_rank', + inputs={ + 'X': [init_tensor], + 'RankTable': [self.lod_rank_table] + }, + outputs={'Out': [init_reordered]}) + init_tensor = init_reordered + mem_array = parent_block.create_var( + name=unique_name('dynamic_rnn_mem_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init.dtype) + parent_block.append_op( + type='write_to_array', + inputs={'X': init_tensor, + 'I': self.zero_idx}, + outputs={'Out': mem_array}) + retv = array_read(array=mem_array, i=self.step_idx) + retv = shrink_memory( + x=retv, i=self.step_idx, table=self.lod_rank_table) + self.mem_dict[retv.name] = mem_array + return retv + else: + if len(self.input_array) == 0: + raise ValueError( + "step_input should be invoked before memory(shape=..., value=...)" + ) + parent_block = self._parent_block_() + init = parent_block.create_var( + name=unique_name('mem_init'), dtype=dtype) + arr, dtype = self.input_array[0] + in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype) + parent_block.append_op( + type='read_from_array', + inputs={'X': [arr], + 'I': [self.zero_idx]}, + outputs={'Out': [in0]}) + parent_block.append_op( + type='fill_constant_batch_size_like', + inputs={'Input': [in0]}, + outputs={'Out': [init]}, + attrs={ + 'shape': [-1] + shape, + 'value': float(value), + 'dtype': init.dtype + }) + return self.memory(init=init) + + def update_memory(self, ex_mem, new_mem): + self._assert_in_rnn_block_('update_memory') + if not isinstance(ex_mem, Variable): + raise TypeError("The input arg `ex_mem` of update_memory() must " + "be a Variable") + if not isinstance(new_mem, Variable): + raise TypeError("The input arg `new_mem` of update_memory() must " + "be a Variable") + + mem_array = self.mem_dict.get(ex_mem.name, None) + if mem_array is None: + raise ValueError("Please
invoke memory before update_memory") + if self.lod_rank_table is None: + raise ValueError("Please invoke step_input before update_memory") + + self.mem_link.append((new_mem, mem_array)) + + def output(self, *outputs): + self._assert_in_rnn_block_('output') + parent_block = self._parent_block_() + for each in outputs: + outside_array = parent_block.create_var( + name=unique_name("_".join( + [self.helper.name, "output_array", each.name])), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=each.dtype) + array_write(x=each, i=self.step_idx, array=outside_array) + self.output_array.append(outside_array) + + def _parent_block_(self): + prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + + return parent_block + + def _assert_in_rnn_block_(self, method): + if self.status != DynamicRNN.IN_RNN: + raise ValueError("{0} can only be invoked inside rnn block.".format( + method)) + + +@autodoc +def reorder_lod_tensor_by_rank(x, rank_table): + helper = LayerHelper('reorder_lod_tensor_by_rank', **locals()) + helper.is_instance('x', Variable) + helper.is_instance('rank_table', Variable) + + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='reorder_lod_tensor_by_rank', + inputs={'X': [x], + 'RankTable': [rank_table]}, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/v2/fluid/layers/device.py b/python/paddle/v2/fluid/layers/device.py new file mode 100644 index 0000000000000000000000000000000000000000..ef74b2b2f08ae446b612a5e5552344b3901d8178 --- /dev/null +++ b/python/paddle/v2/fluid/layers/device.py @@ -0,0 +1,38 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +All util layers. +""" + +from ..layer_helper import LayerHelper +from ..framework import unique_name +from ..registry import autodoc + +__all__ = ['get_places'] + + +@autodoc +def get_places(device_count=None, device_type=None): + helper = LayerHelper('get_places', **locals()) + out_places = helper.create_variable(name=unique_name(helper.name + ".out")) + attrs = dict() + if device_count is not None: + attrs['device_count'] = int(device_count) + if device_type is not None: + attrs['device_type'] = str(device_type) + + helper.append_op( + type='get_places', outputs={"Out": [out_places]}, attrs=attrs) + + return out_places diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py new file mode 100644 index 0000000000000000000000000000000000000000..6177f0b4d71cdda8637fc40e3e65c72842bf7439 --- /dev/null +++ b/python/paddle/v2/fluid/layers/io.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from .. import core +from ..layer_helper import LayerHelper + +__all__ = ['data'] + + +def data(name, + shape, + append_batch_size=True, + dtype='float32', + lod_level=0, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True): + """ + **Data Layer** + + This function takes in the input and based on whether data has + to be returned back as a minibatch, it creates the global variable using + the helper functions. The global variables can be accessed by all the + following operations and layers in the graph. + + All the input variables of this function are passed in as local variables + to the LayerHelper constructor. + + Args: + name(str): The name/alias of the function + shape(list): Tuple declaring the shape. + append_batch_size(bool): Whether or not to append the data as a batch. + dtype(int|float): The type of data : float32, float_16, int etc + type(VarType): The output type. By default it is LOD_TENSOR. + lod_level(int): The LoD Level. 0 means the input data is not a sequence. + main_program(Program): Name of the main program that calls this + startup_program(Program): Name of the startup program + stop_gradient(bool): A boolean that mentions whether gradient should flow. + + Returns: + Variable: The global variable that gives access to the data. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='x', shape=[784], dtype='float32') + """ + helper = LayerHelper('data', **locals()) + shape = list(shape) + for i in xrange(len(shape)): + if shape[i] is None: + shape[i] = -1 + append_batch_size = False + elif shape[i] < 0: + append_batch_size = False + + if append_batch_size: + shape = [-1] + shape # append batch size as -1 + + return helper.create_global_variable( + name=name, + shape=shape, + dtype=dtype, + type=type, + stop_gradient=stop_gradient, + lod_level=lod_level) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py new file mode 100644 index 0000000000000000000000000000000000000000..4e8fd407c9983e2827d3137fa4f49a8425d5dce2 --- /dev/null +++ b/python/paddle/v2/fluid/layers/nn.py @@ -0,0 +1,1599 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +All layers just related to the neural network. 
+""" + +from ..layer_helper import LayerHelper +from ..initializer import Normal, Constant +from ..framework import Variable +from ..param_attr import ParamAttr +from tensor import concat + +__all__ = [ + 'fc', + 'embedding', + 'dynamic_lstm', + 'gru_unit', + 'linear_chain_crf', + 'crf_decoding', + 'cos_sim', + 'cross_entropy', + 'square_error_cost', + 'accuracy', + 'chunk_eval', + 'sequence_conv', + 'conv2d', + 'sequence_pool', + 'pool2d', + 'batch_norm', + 'beam_search_decode', + 'conv2d_transpose', + 'sequence_expand', + 'lstm_unit', + 'reduce_sum', + 'reduce_mean', + 'reduce_max', + 'reduce_min', + 'sequence_first_step', + 'sequence_last_step', + 'dropout', + 'split', +] + + +def fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None): + """ + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable (one for each input tensor) called weights for each input + tensor, which represents a fully connected weight matrix from each input + unit to each output unit. The fully connected layer multiplies each input + tensor with its coresponding weight to produce an output Tensor. If + multiple input tensors are given, the results of multiple multiplications + will be sumed up. If bias_attr is not None, a biases variable will be + created and added to the output. Finally, if activation is not None, + it will be applied to the output as well. + + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}W_iX_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation funtion. + * :math:`Out`: The output tensor. + + Args: + input(Variable|list): The input tensor(s) to the fully connected layer. + size(int): The number of output units in the fully connected layer. + num_flatten_dims(int): The fc layer can accept an input tensor with more + than two dimensions. If this happens, the + multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter + `num_flatten_dims` determines how the input tensor + is flattened: the first `num_flatten_dims` + dimensions will be flatten to form the first + dimension of the final matrix (height of the + matrix), and the rest `rank(X) - num_flatten_dims` + dimensions are flattened to form the second + dimension of the final matrix (width of the matrix). + For example, suppose `X` is a 6-dimensional tensor + with a shape [2, 3, 4, 5, 6], and + `num_flatten_dims` = 3. Then, the flattened matrix + will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + By default, `num_flatten_dims` is set to 1. + param_attr(ParamAttr|list): The parameter attribute for learnable + parameters/weights of the fully connected + layer. + param_initializer(ParamAttr|list): The initializer used for the + weight/parameter. If set None, + XavierInitializer() will be used. + bias_attr(ParamAttr|list): The parameter attribute for the bias parameter + for this layer. If set None, no bias will be + added to the output units. + bias_initializer(ParamAttr|list): The initializer used for the bias. + If set None, then ConstantInitializer() + will be used. + act(str): Activation to be applied to the output of the fully connected + layer. + name(str): Name/alias of the fully connected layer. + + + Returns: + Variable: The output tensor variable. 
+ + +def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): + """ + **Embedding Layer** + + This layer is used to look up a vector of IDs, provided by *input*, in a lookup table. + The result of this lookup is the embedding of each ID in the *input*. + + All the input variables are passed in as local variables to the LayerHelper + constructor. + + Args: + input(Variable): Input to the function + size(tuple|list|None): Shape of the lookup table parameter + is_sparse(bool): Boolean flag specifying whether the input is sparse + param_attr(ParamAttr): Parameters for this layer + dtype(np.dtype|core.DataType|str): The data type: float32, float_16, int, etc. + + Returns: + Variable: The tensor variable storing the embeddings of the \ + supplied inputs. + + Examples: + .. code-block:: python + + dict_size = len(dataset.ids) + data = fluid.layers.data(name='ids', shape=[32, 32], dtype='int64') + fc = fluid.layers.embedding(input=data, size=[dict_size, 16]) + """ + + helper = LayerHelper('embedding', **locals()) + w = helper.create_parameter( + attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) + tmp = helper.create_tmp_variable(dtype) + helper.append_op( + type='lookup_table', + inputs={'Ids': input, + 'W': w}, + outputs={'Out': tmp}, + attrs={'is_sparse': is_sparse}) + return tmp
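What the lookup_table op does is a per-ID row gather; a minimal plain-Python sketch (hypothetical table and ids, not the fluid kernel):

.. code-block:: python

    # Plain-Python sketch of an embedding lookup: each id selects one row
    # of the [dict_size, emb_dim] table.
    table = [[0.0, 0.1], [1.0, 1.1], [2.0, 2.1]]   # dict_size=3, emb_dim=2
    ids = [2, 0, 2]
    emb = [table[i] for i in ids]
    assert emb == [[2.0, 2.1], [0.0, 0.1], [2.0, 2.1]]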
+ + +# TODO(qijun): expose H0 and C0 +def dynamic_lstm(input, + size, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32'): + helper = LayerHelper('lstm', **locals()) + size = size / 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + hidden = helper.create_tmp_variable(dtype) + cell = helper.create_tmp_variable(dtype) + batch_gate = helper.create_tmp_variable(dtype) + batch_cell_pre_act = helper.create_tmp_variable(dtype) + + helper.append_op( + type='lstm', + inputs={'Input': input, + 'Weight': weight, + 'Bias': bias}, + outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell
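dynamic_lstm packs all four gates into one projected input, which is why it divides `size` by 4. A quick plain-Python check of that parameter bookkeeping (assumed numbers, illustrative only):

.. code-block:: python

    # Illustrative check of dynamic_lstm's packed parameter shapes: `size`
    # is the width of the projected input holding all four gates, so the
    # actual hidden width is size / 4.
    size = 512
    hidden = size // 4                        # 128
    weight_shape = [hidden, 4 * hidden]       # recurrent weight, [128, 512]
    bias_with_peepholes = [1, 7 * hidden]     # 4 gate biases + 3 peephole vectors
    bias_without_peepholes = [1, 4 * hidden]
    assert weight_shape == [128, 512]
    assert (bias_with_peepholes, bias_without_peepholes) == ([1, 896], [1, 512])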
+ + +def gru_unit(input, + hidden, + size, + weight=None, + bias=None, + activation='tanh', + gate_activation='sigmoid'): + """ + GRU unit layer. The equation of a gru step is: + + .. math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + + h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1}) + + The inputs of the gru unit include :math:`z_t` and :math:`h_{t-1}`. In terms + of the equation above, the :math:`z_t` is split into 3 parts - + :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to + implement a full GRU unit operator for an input, a fully + connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. + + The terms :math:`u_t` and :math:`r_t` represent the update and reset gates + of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is + an intermediate candidate hidden output, which is denoted by :math:`m_t`. + This layer has three outputs: :math:`h_t`, :math:`dot(r_t, h_{t-1})` + and the concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. + + Args: + input (Variable): The fc transformed input value of the current step. + hidden (Variable): The hidden value of the GRU unit from the previous step. + size (integer): The input dimension value. + weight (ParamAttr): The weight parameters for gru unit. Default: None + bias (ParamAttr): The bias parameters for gru unit. Default: None + activation (string): The activation type for cell (actNode). Default: 'tanh' + gate_activation (string): The activation type for gates (actGate). Default: 'sigmoid' + + Returns: + tuple: The hidden value, reset-hidden value and gate values. + + Examples: + + .. code-block:: python + + # assuming we have x_t_data and prev_hidden of size=10 + x_t = fluid.layers.fc(input=x_t_data, size=30) + hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(input=x_t, + hidden=prev_hidden) + + """ + activation_dict = dict( + identity=0, + sigmoid=1, + tanh=2, + relu=3, ) + activation = activation_dict[activation] + gate_activation = activation_dict[gate_activation] + + helper = LayerHelper('gru_unit', **locals()) + dtype = helper.input_dtype() + size = size / 3 + + # create weight + if weight is None: + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + + # create bias + if bias is None: + bias_size = [1, 3 * size] + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + gate = helper.create_tmp_variable(dtype) + reset_hidden_pre = helper.create_tmp_variable(dtype) + updated_hidden = helper.create_tmp_variable(dtype) + + helper.append_op( + type='gru_unit', + inputs={'Input': input, + 'HiddenPrev': hidden, + 'Weight': weight}, + outputs={ + 'Gate': gate, + 'ResetHiddenPrev': reset_hidden_pre, + 'Hidden': updated_hidden, + }, + attrs={ + 'activation': activation, + 'gate_activation': gate_activation, + }) + + return updated_hidden, reset_hidden_pre, gate + + +def linear_chain_crf(input, label, param_attr=None): + helper = LayerHelper('linear_chain_crf', **locals()) + size = input.shape[1] + transition = helper.create_parameter( + attr=helper.param_attr, + shape=[size + 2, size], + dtype=helper.input_dtype()) + alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) + emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) + transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) + log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='linear_chain_crf', + inputs={"Emission": [input], + "Transition": transition, + "Label": label}, + outputs={ + "Alpha": [alpha], + "EmissionExps": [emission_exps], + "TransitionExps": transition_exps, + "LogLikelihood": log_likelihood + }) + + return log_likelihood + + +def crf_decoding(input, param_attr, label=None): + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], + "Transition": transition, + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) + + return viterbi_path
+ """ + helper = LayerHelper('cos_sim', **kwargs) + out = helper.create_tmp_variable(dtype=X.dtype) + xnorm = helper.create_tmp_variable(dtype=X.dtype) + ynorm = helper.create_tmp_variable(dtype=X.dtype) + helper.append_op( + type='cos_sim', + inputs={'X': [X], + 'Y': [Y]}, + outputs={'Out': [out], + 'XNorm': [xnorm], + 'YNorm': [ynorm]}) + return out + + +def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): + helper = LayerHelper('dropout', **kwargs) + out = helper.create_tmp_variable(dtype=x.dtype) + mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) + helper.append_op( + type='dropout', + inputs={'X': [x]}, + outputs={'Out': [out], + 'Mask': [mask]}, + attrs={'dropout_prob': dropout_prob, + 'is_test': is_test, + 'seed': seed}) + return out + + +def cross_entropy(input, label, **kwargs): + """ + **Cross Entropy Layer** + + This layer computes the cross entropy between `input` and `label`. It supports + both standard cross-entropy and soft-label cross-entropy loss computation. + + 1) One-hot cross-entropy: + `soft_label = False`, `Label[i, 0]` indicates the class index for sample i: + + .. math:: + + Y[i] = -\log(X[i, Label[i]]) + + 2) Soft-label cross-entropy: + `soft_label = True`, `Label[i, j]` indicates the soft label of class j + for sample i: + + .. math:: + + Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} + + Please make sure that in this case the summation of each row of `label` + equals one. + + 3) One-hot cross-entropy with vecterized `label`: + As a special case of 2), when each row of 'label' has only one + non-zero element which is equal to 1, soft-label cross-entropy degenerates + to a one-hot cross-entropy with one-hot label representation. + + Args: + input (Variable|list): a 2-D tensor with shape [N x D], where N is the + batch size and D is the number of classes. This input is a probability + computed by the previous operator, which is almost always the result + of a softmax operator. + label (Variable|list): the ground truth which is a 2-D tensor. When + `soft_label` is set to `False`, `label` is a tensor with shape + [N x 1]. When `soft_label` is set to `True`, `label` is a + tensor with shape [N x D]. + soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate + the given labels as soft labels, default `False`. + + Returns: + A 2-D tensor with shape [N x 1], the cross entropy loss. + + Raises: + `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ + `soft_label == True`, and the 2nd dimension of `input` and `label` are not \ + equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. + + Examples: + .. code-block:: python + + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + """ + helper = LayerHelper('cross_entropy', **kwargs) + out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type='cross_entropy', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out]}, + attrs=kwargs) + return out + + +def square_error_cost(input, label, **kwargs): + """ + **Square error cost layer** + + This layer accepts input predictions and target label and returns the squared error cost. + For predictions, :math:`X`, and target labels, :math:`Y`, the equation is: + + .. math:: + + Out = (X - Y)^2 + + In the above equation: + + * :math:`X`: Input predictions, a tensor. + * :math:`Y`: Input labels, a tensor. + * :math:`Out`: Output value, same shape with :math:`X`. 
+ + +def square_error_cost(input, label, **kwargs): + """ + **Square error cost layer** + + This layer accepts input predictions and target labels and returns the squared error cost. + For predictions :math:`X` and target labels :math:`Y`, the equation is: + + .. math:: + + Out = (X - Y)^2 + + In the above equation: + + * :math:`X`: Input predictions, a tensor. + * :math:`Y`: Input labels, a tensor. + * :math:`Out`: Output value, same shape as :math:`X`. + + Args: + input(Variable): Input tensor, holding the predictions. + label(Variable): Label tensor, holding the target labels. + + Returns: + Variable: The tensor variable storing the element-wise squared error difference \ + of input and label. + + Examples: + .. code-block:: python + + y = layers.data(name='y', shape=[1], dtype='float32') + y_predict = layers.data(name='y_predict', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + + """ + helper = LayerHelper('square_error_cost', **kwargs) + minus_out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type='elementwise_sub', + inputs={'X': [input], + 'Y': [label]}, + outputs={'Out': [minus_out]}) + + square_out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type='square', inputs={'X': [minus_out]}, + outputs={'Out': [square_out]}) + return square_out + + +def accuracy(input, label, k=1, correct=None, total=None, **kwargs): + """ + This function computes the accuracy using the input and label. + A sample is counted as correct if the ground-truth label is among the + top k predictions. + """ + helper = LayerHelper("accuracy", **kwargs) + topk_out = helper.create_tmp_variable(dtype=input.dtype) + topk_indices = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="top_k", + inputs={"X": [input]}, + outputs={"Out": [topk_out], + "Indices": [topk_indices]}, + attrs={"k": k}) + acc_out = helper.create_tmp_variable(dtype="float32") + if correct is None: + correct = helper.create_tmp_variable(dtype="int64") + if total is None: + total = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="accuracy", + inputs={ + "Out": [topk_out], + "Indices": [topk_indices], + "Label": [label] + }, + outputs={ + "Accuracy": [acc_out], + "Correct": [correct], + "Total": [total], + }) + return acc_out + + +def chunk_eval(input, + label, + chunk_scheme, + num_chunk_types, + excluded_chunk_types=None, + **kwargs): + """ + This function computes and outputs the precision, recall and + F1-score of chunk detection. + """ + helper = LayerHelper("chunk_eval", **kwargs) + + # prepare output + precision = helper.create_tmp_variable(dtype="float32") + recall = helper.create_tmp_variable(dtype="float32") + f1_score = helper.create_tmp_variable(dtype="float32") + num_infer_chunks = helper.create_tmp_variable(dtype="int64") + num_label_chunks = helper.create_tmp_variable(dtype="int64") + num_correct_chunks = helper.create_tmp_variable(dtype="int64") + + helper.append_op( + type="chunk_eval", + inputs={"Inference": [input], + "Label": [label]}, + outputs={ + "Precision": [precision], + "Recall": [recall], + "F1-Score": [f1_score], + "NumInferChunks": [num_infer_chunks], + "NumLabelChunks": [num_label_chunks], + "NumCorrectChunks": [num_correct_chunks] + }, + attrs={ + "num_chunk_types": num_chunk_types, + "chunk_scheme": chunk_scheme, + "excluded_chunk_types": excluded_chunk_types or [] + }) + return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks + + +def sequence_conv(input, + num_filters, + filter_size=3, + filter_stride=1, + padding=None, + bias_attr=None, + param_attr=None, + act=None): + """ + This function creates the op for sequence_conv, using the inputs and + other convolutional configurations for the filters and stride as given + in the input parameters to the function. + """ + + # FIXME(dzh) : want to unify the argument of python layer + # function. So we ignore some unnecessary attributes, + # such as padding_trainable and context_start. + + helper = LayerHelper('sequence_conv', **locals()) + dtype = helper.input_dtype() + filter_shape = [filter_size * input.shape[1], num_filters] + filter_param = helper.create_parameter( + attr=helper.param_attr, shape=filter_shape, dtype=dtype) + pre_bias = helper.create_tmp_variable(dtype) + + helper.append_op( + type='sequence_conv', + inputs={ + 'X': [input], + 'Filter': [filter_param], + }, + outputs={"Out": pre_bias}, + attrs={ + 'contextStride': filter_stride, + 'contextStart': -int(filter_size / 2), + 'contextLength': filter_size + }) + pre_act = helper.append_bias_op(pre_bias) + return helper.append_activation(pre_act)
+ + +def conv2d(input, + num_filters, + filter_size, + stride=None, + padding=None, + groups=None, + param_attr=None, + bias_attr=None, + act=None): + """ + **Convolution2D Layer** + + The convolution2D layer calculates the output based on the input, filter + and the strides, paddings, dilations and groups parameters. Input(Input) and Output(Output) + are in NCHW format, where N is the batch size, C is the number of channels, H is the height + of the feature, and W is the width of the feature. + For the details of the convolution layer, please refer to UFLDL's convolution tutorial. + If bias attribute and activation type are provided, bias is added to the output of the convolution, + and the corresponding activation function is applied to the final result. + For each input :math:`X`, the equation is: + + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCHW format. + * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + Args: + input(Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filters. It is the same as the output + image channel. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + groups(int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None + act(str): Activation type. Default: None + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") + """ + + if stride is None: + stride = [1, 1] + helper = LayerHelper('conv2d', **locals()) + dtype = helper.input_dtype() + + num_channels = input.shape[1] + if groups is None: + num_filter_channels = num_channels + else: + if num_channels % groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels / groups + + if isinstance(filter_size, int): + filter_size = [filter_size, filter_size] + if isinstance(stride, int): + stride = [stride, stride] + if isinstance(padding, int): + padding = [padding, padding] + + input_shape = input.shape + filter_shape = [num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 + return Normal(0.0, std, 0) + + filter_param = helper.create_parameter( + attr=helper.param_attr, + shape=filter_shape, + dtype=dtype, + default_initializer=_get_default_param_initializer()) + + pre_bias = helper.create_tmp_variable(dtype) + + helper.append_op( + type='conv2d', + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={'strides': stride, + 'paddings': padding, + 'groups': groups}) + + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + + return helper.append_activation(pre_act)
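The H_out/W_out formula in the conv2d docstring can be checked directly; a plain-Python sketch with assumed numbers:

.. code-block:: python

    # Plain-Python check of the conv2d output-size formula from the docstring.
    def conv_out_dim(in_dim, pad, dilation, k, stride):
        return (in_dim + 2 * pad - (dilation * (k - 1) + 1)) // stride + 1

    # 32x32 input, 3x3 filter, stride 1, no padding -> 30x30 output
    assert conv_out_dim(32, pad=0, dilation=1, k=3, stride=1) == 30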
+ + +def sequence_pool(input, pool_type, **kwargs): + """ + This function adds the operator for sequence pooling. + It pools the features of all time-steps of each instance, applied + to the input according to the given pool_type. + + It supports four pool_type values: + + - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}` + - sum: :math:`Out[i] = \sum_jX_{ij}` + - sqrt: :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}` + - max: :math:`Out[i] = max(X_i)` + + .. code-block:: text + + x is a 1-level LoDTensor: + x.lod = [[0, 2, 5, 7]] + x.data = [1, 3, 2, 4, 6, 5, 1] + x.dims = [7, 1] + + then output is a Tensor: + out.dim = [3, 1] + with condition len(x.lod[-1]) - 1 == out.dims[0] + + for different pool_type: + average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 + sum : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 + sqrt : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), + 6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2) + max : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) + + Args: + input(variable): The input variable which is a LoDTensor. + pool_type (string): The pooling type of sequence_pool. + It supports average, sum, sqrt and max. + + Returns: + The sequence pooling variable which is a Tensor. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') + sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') + sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt') + max_x = fluid.layers.sequence_pool(input=x, pool_type='max') + """ + helper = LayerHelper('sequence_pool', input=input, **kwargs) + dtype = helper.input_dtype() + pool_out = helper.create_tmp_variable(dtype) + max_index = helper.create_tmp_variable(dtype) + + helper.append_op( + type="sequence_pool", + inputs={"X": input}, + outputs={"Out": pool_out, + "MaxIndex": max_index}, + attrs={"pooltype": pool_type.upper()}) + + # when pool_type is max, variable max_index is initialized, + # so we stop the gradient explicitly here + if pool_type == 'max': + max_index.stop_gradient = True + + return pool_out
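The worked example in the docstring above can be reproduced in a few lines of plain Python (not the fluid kernel):

.. code-block:: python

    import math

    # Plain-Python reproduction of the sequence_pool example above.
    x, lod = [1, 3, 2, 4, 6, 5, 1], [0, 2, 5, 7]
    seqs = [x[lod[i]:lod[i + 1]] for i in range(len(lod) - 1)]

    average = [sum(s) / float(len(s)) for s in seqs]     # [2.0, 4.0, 3.0]
    total = [sum(s) for s in seqs]                       # [4, 12, 6]
    sqrt = [sum(s) / math.sqrt(len(s)) for s in seqs]    # [2.82.., 6.92.., 4.24..]
    biggest = [max(s) for s in seqs]                     # [3, 6, 5]
    assert total == [4, 12, 6] and biggest == [3, 6, 5]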
+ + +def sequence_first_step(input, **kwargs): + """ + This function gets the first step of a sequence. + + .. code-block:: text + + x is a 1-level LoDTensor: + x.lod = [[0, 2, 5, 7]] + x.data = [1, 3, 2, 4, 6, 5, 1] + x.dims = [7, 1] + + then output is a Tensor: + out.dim = [3, 1] + with condition len(x.lod[-1]) - 1 == out.dims[0] + out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) + + Args: + input(variable): The input variable which is a LoDTensor. + + Returns: + The sequence's first step variable which is a Tensor. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + x_first_step = fluid.layers.sequence_first_step(input=x) + """ + return sequence_pool(input=input, pool_type="first") + + +def sequence_last_step(input, **kwargs): + """ + This function gets the last step of a sequence. + + .. code-block:: text + + x is a 1-level LoDTensor: + x.lod = [[0, 2, 5, 7]] + x.data = [1, 3, 2, 4, 6, 5, 1] + x.dims = [7, 1] + + then output is a Tensor: + out.dim = [3, 1] + with condition len(x.lod[-1]) - 1 == out.dims[0] + out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) + + Args: + input(variable): The input variable which is a LoDTensor. + + Returns: + The sequence's last step variable which is a Tensor. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + x_last_step = fluid.layers.sequence_last_step(input=x) + """ + return sequence_pool(input=input, pool_type="last") + + +def pool2d(input, + pool_size, + pool_type, + pool_stride=None, + pool_padding=None, + global_pooling=False): + """ + This function adds the operator for pooling in 2 dimensions, using the + pooling configurations mentioned in the input parameters. + """ + if pool_padding is None: + pool_padding = [0, 0] + if pool_stride is None: + pool_stride = [1, 1] + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." % + str(pool_type)) + if isinstance(pool_size, int): + pool_size = [pool_size, pool_size] + if isinstance(pool_stride, int): + pool_stride = [pool_stride, pool_stride] + if isinstance(pool_padding, int): + pool_padding = [pool_padding, pool_padding] + + helper = LayerHelper('pool2d', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="pool2d", + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding + }) + + return pool_out + + +def batch_norm(input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW'): + """ + This function helps create an operator to implement + the BatchNorm layer using the configurations from the input parameters. + """ + helper = LayerHelper('batch_norm', **locals()) + dtype = helper.input_dtype() + + input_shape = input.shape + if data_layout == 'NCHW': + channel_num = input_shape[1] + elif data_layout == 'NHWC': + channel_num = input_shape[-1] + else: + raise ValueError("unsupported data layout:" + data_layout) + + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0)) + + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + + mean = helper.create_global_variable( + dtype=input.dtype, + shape=param_shape, + persistable=True, + stop_gradient=True) + helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) + + variance = helper.create_global_variable( + dtype=input.dtype, + shape=param_shape, + persistable=True, + stop_gradient=True) + helper.set_variable_initializer(var=variance, initializer=Constant(1.0)) + + # create output + # mean and mean_out share the same memory + mean_out = mean + # variance and variance_out share the same memory + variance_out = variance + saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + + batch_norm_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="batch_norm", + inputs={ + "X": input, + "Scale": scale, + "Bias": bias, + "Mean": mean, + "Variance": variance + }, + outputs={ + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance + }, + attrs={"momentum": momentum, + "epsilon": epsilon, + "is_test": is_test}) + + return helper.append_activation(batch_norm_out) + + +def beam_search_decode(ids, scores): + helper = LayerHelper('beam_search_decode', **locals()) + sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) + sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) + + helper.append_op( + type="beam_search_decode", + inputs={"Ids": ids, + "Scores": scores}, + outputs={ + "SentenceIds": sentence_ids, + "SentenceScores": sentence_scores + }) + + return sentence_ids, sentence_scores + + +def conv2d_transpose(input, + num_filters, + output_size=None, + filter_size=None, + padding=None, + stride=None, + dilation=None, + param_attr=None): + """ + The transpose of the conv2d layer. + + This layer is also known as the deconvolution layer. + + Args: + input(Variable): The input image with [N, C, H, W] format.
+ num_filters(int): The number of filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain two integers, (image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. + param_attr: Parameter Attribute. + main_program(Program): the main program + startup_program(Program): the startup program + + Returns: + Variable: Output image. + """ + helper = LayerHelper("conv2d_transpose", **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv2d_transpose must be Variable") + input_channel = input.shape[1] + + op_attr = dict() + + if isinstance(padding, int): + op_attr['paddings'] = [padding, padding] + elif padding is not None: + op_attr['paddings'] = padding + + if isinstance(stride, int): + op_attr['strides'] = [stride, stride] + elif stride is not None: + op_attr['strides'] = stride + + if isinstance(dilation, int): + op_attr['dilations'] = [dilation, dilation] + elif dilation is not None: + op_attr['dilations'] = dilation + + if filter_size is None: + if output_size is None: + raise ValueError("output_size must be set when filter_size is None") + if isinstance(output_size, int): + output_size = [output_size, output_size] + + padding = op_attr.get('paddings', [0, 0]) + stride = op_attr.get('strides', [1, 1]) + dilation = op_attr.get('dilations', [1, 1]) + + h_in = input.shape[2] + w_in = input.shape[3] + + filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 * + padding[0] - 1) / dilation[0] + 1 + filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * + padding[1] - 1) / dilation[1] + 1 + filter_size = [filter_size_h, filter_size_w] + + elif isinstance(filter_size, int): + filter_size = [filter_size, filter_size] + + filter_shape = [input_channel, num_filters] + filter_size + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) + + out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type='conv2d_transpose', + inputs={'Input': [input], + 'Filter': [img_filter]}, + outputs={'Output': out}, + attrs=op_attr) + + return out + + +def sequence_expand(x, y): + """Sequence Expand Layer. This layer will expand the input variable **x** + according to LoD information of **y**. And the following examples will + explain how sequence_expand works: + + .. 
code-block:: text + + * Case 1 + x is a LoDTensor: + x.lod = [[0, 2, 3], + [0, 1, 3, 4]] + x.data = [a, b, c, d] + x.dims = [4, 1] + + y is a LoDTensor: + y.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] + + with condition len(y.lod[-1]) - 1 == x.dims[0] + + then output is a 2-level LoDTensor: + out.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] + out.data = [a, a, a, b, b, b, c, d] + out.dims = [8, 1] + + * Case 2 + x is a Tensor: + x.data = [a, b, c] + x.dims = [3, 1] + + y is a LoDTensor: + y.lod = [[0, 2, 3, 6]] + + with condition len(y.lod[-1]) - 1 == x.dims[0] + + then output is a 1-level LoDTensor: + out.lod = [[0, 2, 3, 6]] + out.data = [a, a, b, c, c, c] + out.dims = [6, 1] + + Args: + x (Variable): The input variable which is a Tensor or LoDTensor. + y (Variable): The input variable which is a LoDTensor. + + Returns: + Variable: The expanded variable which is a LoDTensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10], dtype='float32') + y = fluid.layers.data(name='y', shape=[10, 20], + dtype='float32', lod_level=1) + out = layers.sequence_expand(x=x, y=y) + """ + helper = LayerHelper('sequence_expand', input=x, **locals()) + dtype = helper.input_dtype() + tmp = helper.create_tmp_variable(dtype) + helper.append_op( + type='sequence_expand', inputs={'X': x, + 'Y': y}, outputs={'Out': tmp}) + return tmp + + +def lstm_unit(x_t, + hidden_t_prev, + cell_t_prev, + forget_bias=0.0, + param_attr=None, + bias_attr=None): + """Lstm unit layer. The equation of a lstm step is: + + .. math:: + + i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i) + + f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f) + + c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c) + + o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o) + + h_t & = o_t tanh(c_t) + + The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and + :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}` + should be same. The implementation separates the linear transformation and + non-linear transformation apart. Here, we take :math:`i_t` as an example. + The linear transformation is applied by calling a `fc` layer and the + equation is: + + .. math:: + + L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i + + The non-linear transformation is applied by calling `lstm_unit_op` and the + equation is: + + .. math:: + + i_t = \sigma(L_{i_t}) + + This layer has two outputs including :math:`h_t` and :math:`o_t`. + + Args: + x_t (Variable): The input value of current step, a 2-D tensor with shape + M x N, M for batch size and N for input size. + hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor + with shape M x S, M for batch size and S for size of lstm unit. + cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with + shape M x S, M for batch size and S for size of lstm unit. + forget_bias (float): The forget bias of lstm unit. + param_attr (ParamAttr): The attributes of parameter weights, used to set + initializer, name etc. + bias_attr (ParamAttr): The attributes of bias weights, if not False, + bias weights will be created and be set to default value. + + Returns: + tuple: The hidden value and cell value of lstm unit. + + Raises: + ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ + not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ + and **cell_t_prev** not be the same or the 2nd dimensions of \ + **hidden_t_prev** and **cell_t_prev** not be the same. + + Examples: + + .. 
code-block:: python + + x_t = fluid.layers.fc(input=x_t_data, size=10) + prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30) + prev_cell = fluid.layers.fc(input=prev_cell_data, size=30) + hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t, + hidden_t_prev=prev_hidden, + cell_t_prev=prev_cell) + """ + helper = LayerHelper('lstm_unit', **locals()) + + if len(x_t.shape) != 2: + raise ValueError("Rank of x_t must be 2.") + + if len(hidden_t_prev.shape) != 2: + raise ValueError("Rank of hidden_t_prev must be 2.") + + if len(cell_t_prev.shape) != 2: + raise ValueError("Rank of cell_t_prev must be 2.") + + if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[ + 0] != cell_t_prev.shape[0]: + raise ValueError("The 1st dimensions of x_t, hidden_t_prev and " + "cell_t_prev must be the same.") + + if hidden_t_prev.shape[1] != cell_t_prev.shape[1]: + raise ValueError("The 2nd dimensions of hidden_t_prev and " + "cell_t_prev must be the same.") + + if bias_attr is None: + bias_attr = ParamAttr() + + size = cell_t_prev.shape[1] + concat_out = concat(input=[x_t, hidden_t_prev], axis=1) + fc_out = fc(input=concat_out, + size=4 * size, + param_attr=param_attr, + bias_attr=bias_attr) + dtype = x_t.dtype + c = helper.create_tmp_variable(dtype) + h = helper.create_tmp_variable(dtype) + + helper.append_op( + type='lstm_unit', + inputs={"X": fc_out, + "C_prev": cell_t_prev}, + outputs={"C": c, + "H": h}, + attrs={"forget_bias": forget_bias}) + + return h, c + + +def reduce_sum(input, dim=None, keep_dim=False): + """ + Computes the sum of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the sum is performed. If + :attr:`None`, sum all elements of :attr:`input` and return a + Tensor variable with a single element, otherwise must be in the + range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, + the dimension to reduce is :math:`rank + dim`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_sum(x) # [3.5] + fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] + fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6] + fluid.layers.reduce_sum(x, dim=1, keep_dim=True) # [[1.9], [1.6]] + """ + helper = LayerHelper('reduce_sum', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def reduce_mean(input, dim=None, keep_dim=False): + """ + Computes the mean of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the mean is computed. If + :attr:`None`, compute the mean over all elements of :attr:`input` + and return a Tensor variable with a single element, otherwise + must be in the range :math:`[-rank(input), rank(input))`. If + :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. 
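+ For example, for a rank-2 input, passing -1 reduces along dimension 1.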
+ keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_mean(x) # [0.4375] + fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] + fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4] + fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]] + """ + helper = LayerHelper('reduce_mean', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_mean', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def reduce_max(input, dim=None, keep_dim=False): + """ + Computes the maximum of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the maximum is computed. + If :attr:`None`, compute the maximum over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. + If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_max(x) # [0.9] + fluid.layers.reduce_max(x, dim=0) # [0.2, 0.3, 0.6, 0.9] + fluid.layers.reduce_max(x, dim=-1) # [0.9, 0.7] + fluid.layers.reduce_max(x, dim=1, keep_dim=True) # [[0.9], [0.7]] + """ + helper = LayerHelper('reduce_max', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_max', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def reduce_min(input, dim=None, keep_dim=False): + """ + Computes the minimum of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the minimum is computed. + If :attr:`None`, compute the minimum over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. + If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. 
code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_min(x) # [0.1] + fluid.layers.reduce_min(x, dim=0) # [0.1, 0.2, 0.5, 0.7] + fluid.layers.reduce_min(x, dim=-1) # [0.2, 0.1] + fluid.layers.reduce_min(x, dim=1, keep_dim=True) # [[0.2], [0.1]] + """ + helper = LayerHelper('reduce_min', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_min', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def split(input, num_or_sections, dim=-1): + """ + Splits the tensor into multiple sub-tensors. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + num_or_sections (int|list): If :attr:`num_or_sections` is an integer, + then the integer indicates the number of equal sized sub-tensors + that the tensor will be divided into. If :attr:`num_or_sections` + is a list of integers, the length of list indicates the number of + sub-tensors and the integers indicate the sizes of sub-tensors' + :attr:`dim` dimension orderly. + dim (int): The dimension along which to split. If :math:`dim < 0`, the + dimension to split along is :math:`rank(input) + dim`. + + Returns: + List: The list of segmented tensor variables. + + Examples: + .. code-block:: python + + # x is a Tensor variable with shape [3, 9, 5]: + x0, x1, x2 = fluid.layers.split(x, num_or_sections=3, dim=1) + x0.shape # [3, 3, 5] + x1.shape # [3, 3, 5] + x2.shape # [3, 3, 5] + x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1) + x0.shape # [3, 2, 5] + x1.shape # [3, 3, 5] + x2.shape # [3, 4, 5] + """ + helper = LayerHelper('split', **locals()) + input_shape = input.shape + dim = (len(input_shape) + dim) if dim < 0 else dim + if isinstance(num_or_sections, int): + assert num_or_sections > 1, 'num_or_sections must be more than 1.' + num = num_or_sections + else: + assert len(num_or_sections) < input_shape[ + dim], 'len(num_or_sections) must not be more than input.shape[dim].' + num = len(num_or_sections) + outs = [ + helper.create_tmp_variable(dtype=helper.input_dtype()) + for i in range(num) + ] + helper.append_op( + type='split', + inputs={'X': input}, + outputs={'Out': outs}, + attrs={ + 'num': num_or_sections if isinstance(num_or_sections, int) else 0, + 'sections': num_or_sections + if isinstance(num_or_sections, list) else [], + 'axis': dim + }) + return outs diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..73d7c895806ef28ffc98db88809e317a86762769 --- /dev/null +++ b/python/paddle/v2/fluid/layers/ops.py @@ -0,0 +1,63 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
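+# Every layer exported from this module is generated, not hand-written: the
+# loop at the bottom of the file asks register_layer() to build a thin Python
+# wrapper from each operator's OpProto definition.
+# A minimal usage sketch, assuming `data` was created with fluid.layers.data
+# (the variable names here are illustrative, not part of this patch):
+#
+#     hidden = sigmoid(x=data)           # wrapper generated from sigmoid's proto
+#     scaled = scale(x=hidden, scale=2.0)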
+from ..registry import register_layer + +__activations__ = [ + 'sigmoid', + 'logsigmoid', + 'exp', + 'relu', + 'tanh', + 'tanh_shrink', + 'softshrink', + 'sqrt', + 'abs', + 'ceil', + 'floor', + 'round', + 'reciprocal', + 'log', + 'square', + 'softplus', + 'softsign', + 'brelu', + 'leaky_relu', + 'soft_relu', + 'elu', + 'relu6', + 'pow', + 'stanh', + 'hard_shrink', + 'thresholded_relu', + 'hard_sigmoid', + 'swish', +] + +__all__ = [ + 'mean', + 'mul', + 'reshape', + 'scale', + 'transpose', + 'sigmoid_cross_entropy_with_logits', + 'elementwise_add', + 'elementwise_div', + 'elementwise_sub', + 'elementwise_mul', + 'clip', + 'sequence_softmax', +] + __activations__ + +for _OP in set(__all__): + globals()[_OP] = register_layer(_OP) diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..255b9d467839a05447c8999c047d96769ae25f17 --- /dev/null +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -0,0 +1,331 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from ..layer_helper import LayerHelper +from ..param_attr import ParamAttr +from ..framework import convert_np_dtype_to_dtype_ +from ..framework import Variable +from ..core import DataType +import numpy + +__all__ = [ + 'create_tensor', + 'create_parameter', + 'cast', + 'concat', + 'sums', + 'assign', + 'fill_constant_batch_size_like', + 'fill_constant', + 'ones', + 'zeros', +] + + +def create_tensor(dtype, name=None): + helper = LayerHelper("create_tensor", **locals()) + return helper.create_variable(name=helper.name, dtype=dtype) + + +def create_parameter(shape, + dtype, + attr=None, + is_bias=False, + default_initializer=None): + """ + Create a parameter + Args: + shape(list[int]): shape of the parameter + dtype(string): element type of the parameter + attr(ParamAttr): attributes of the parameter + is_bias(bool): This can affect which default initializer is chosen + when default_initializer is None. If is_bias, + initializer.Constant(0.0) will be used. Otherwise, + Xavier() will be used. + default_initializer(Initializer): initializer for the parameter + + Returns: + Parameter: the created parameter + """ + helper = LayerHelper("create_parameter") + if attr is None: + attr = ParamAttr() + return helper.create_parameter(attr, shape, dtype, is_bias, + default_initializer) + + +def cast(x, dtype): + """ + This function takes in the input with input_dtype + and casts it to the output_dtype as the output. + """ + helper = LayerHelper('cast', **locals()) + out = helper.create_tmp_variable(dtype=dtype) + helper.append_op( + type='cast', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'in_dtype': x.dtype, + 'out_dtype': out.dtype}) + return out + + +def concat(input, axis=0): + """ + **Concat** + + This function concatenates the input along the axis mentioned + and returns that as the output. 
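+ All tensors in the list must have the same shape in every dimension
+ except the one given by axis.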
+ + Args: + input(list): List of tensors to be concatenated + axis(int): Integer axis along which the tensors will be concatenated + + Returns: + Variable: Output variable of the concatenation + + Examples: + .. code-block:: python + out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) + """ + helper = LayerHelper('concat', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='concat', + inputs={'X': input}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + +def sums(input, out=None): + """This function performs the sum operation on the input and returns the + result as the output. + + Args: + input (Variable|list): The input tensor that has the elements + that need to be summed up. + + Returns: + Variable: The tensor type variable that has the sum of input + written to it. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + a0 = layers.array_read(array=tmp, i=i) + i = layers.increment(x=i) + a1 = layers.array_read(array=tmp, i=i) + mean_a0 = layers.mean(x=a0) + mean_a1 = layers.mean(x=a1) + a_sum = layers.sums(input=[mean_a0, mean_a1]) + """ + helper = LayerHelper('sum', **locals()) + if out is None: + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out}) + return out + + +def assign(input, output): + """ + **Assign** + + This function copies the *input* Variable to the *output* Variable. + + Args: + input(Variable|numpy.ndarray): The source variable + output(Variable): The destination variable + + Returns: + Variable: The destination variable that was supplied as the *output*. + + Examples: + .. code-block:: python + out = fluid.layers.create_tensor(dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + fluid.layers.assign(hidden, out) + """ + helper = LayerHelper('assign', **locals()) + if isinstance(input, Variable): + helper.append_op( + type='scale', + inputs={'X': [input]}, + outputs={'Out': [output]}, + attrs={'scale': 1.0}) + elif isinstance(input, numpy.ndarray): + dtype = convert_np_dtype_to_dtype_(input.dtype) + if dtype == DataType.FP32: + value_name = "fp32_values" + values = [float(v) for v in input.flat] + elif dtype == DataType.INT32: + value_name = "int32_values" + values = [int(v) for v in input.flat] + else: + raise ValueError("Unsupported dtype %s", input.dtype) + if input.size > 1024 * 1024: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) + else: + raise ValueError("Wrong type for assign input: %s" % type(input)) + + return output + + +def fill_constant(shape, dtype, value, force_cpu=False, out=None): + """ + **fill_constant** + + This function creates a tensor with specified `shape` and `dtype`, and + initializes it with a constant specifed by `value`. + + The attribute `stop_gradient` of the created tensor is set to True. + + Args: + shape(tuple|list|None): Shape of the output tensor. + dtype(np.dtype|core.DataType|str): Data type of the output tensor. + value(float): The constant value used to initialize the output tensor. + out(Variable): The output tensor. + + Returns: + Variable: The tensor variable storing the output. + + Examples: + .. 
code-block:: python + + data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') + """ + + helper = LayerHelper("fill_constant", **locals()) + if out is None: + out = helper.create_tmp_variable(dtype=dtype) + helper.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [out]}, + attrs={ + 'shape': shape, + 'dtype': out.dtype, + 'value': float(value), + 'force_cpu': force_cpu + }) + out.stop_gradient = True + return out + + +def fill_constant_batch_size_like(input, + shape, + dtype, + value, + input_dim_idx=0, + output_dim_idx=0): + """ + **fill_constant_batch_size_like** + + This function creates a tensor of specified *shape*, *dtype* and batch size, + and initializes this with a constant supplied in *value*. The batch size is + obtained from the `input` tensor. + + It also sets *stop_gradient* to True. + + Args: + input(Variable): Tensor whose dimensions will be used to get batch size + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + value(float): Constant value to initialize the output tensor + input_dim_idx(int): Index of input's batch size dimension + output_dim_idx(int): Index of output's batch size dimension + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.fill_constant_batch_size_like( + input=like, shape=[1], value=0, dtype='int64') + """ + helper = LayerHelper("fill_constant_batch_size_like", **locals()) + out = helper.create_tmp_variable(dtype=dtype) + helper.append_op( + type='fill_constant_batch_size_like', + inputs={'Input': input}, + outputs={'Out': [out]}, + attrs={ + 'shape': shape, + 'dtype': out.dtype, + 'value': float(value), + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx + }) + out.stop_gradient = True + return out + + +def ones(shape, dtype): + """ + **ones** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with 1. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.ones(shape=[1], dtype='int64') + """ + return fill_constant(value=1.0, **locals()) + + +def zeros(shape, dtype): + """ + **zeros** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with 0. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.zeros(shape=[1], dtype='int64') + """ + return fill_constant(value=0.0, **locals()) diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..89ffe26ed1a70942c4994394f9a3635dda13be69 --- /dev/null +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -0,0 +1,171 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from collections import defaultdict +import framework +from framework import Program, default_main_program, Parameter, Variable +import backward +from backward import _rename_arg_ +from . import core + +dtype_to_size = { + core.DataType.FP16: 2, + core.DataType.FP32: 4, + core.DataType.FP64: 8, + core.DataType.INT16: 2, + core.DataType.INT32: 4, + core.DataType.INT64: 8, + core.DataType.BOOL: 1 +} + + +class ControlFlowGraph(object): + def __init__(self, Program): + self._program = Program + self._succesors = defaultdict(set) + self._presucessors = defaultdict(set) + self._uses = defaultdict(set) + self._defs = defaultdict(set) + self._live_in = defaultdict(set) + self._live_out = defaultdict(set) + + def _add_connections(self, connections): + for node1, node2 in connections: + self._add(node1, node2) + + def _add(self, node1, node2): + self._succesors[node1].add(node2) + self._presucessors[node2].add(node1) + + def _build_graph(self): + program_desc = self._program.get_desc() + block_size = program_desc.num_blocks() + + # TODO(qijun) handle Program with if/while operators + self.global_block_desc = program_desc.block(0) + self.op_size = self.global_block_desc.op_size() + + op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)] + self._add_connections(op_node_connections) + + self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)] + + for i in range(self.op_size): + self._uses[i].update(self.ops[i].input_arg_names()) + self._defs[i].update(self.ops[i].output_arg_names()) + + def _update_graph(self, old_name, new_name, begin_idx=0): + for i in range(begin_idx, self.op_size): + if old_name in self._uses[i]: + self._uses[i].remove(old_name) + self._uses[i].add(new_name) + if old_name in self._defs[i]: + self._defs[i].remove(old_name) + self._defs[i].add(new_name) + if old_name in self._live_in[i]: + self._live_in[i].remove(old_name) + self._live_out[i].add(new_name) + if old_name in self._live_out[i]: + self._live_out[i].remove(old_name) + self._live_out[i].add(new_name) + + def _reach_fixed_point(self, live_in, live_out): + if len(live_in) != len(self._live_in): + return False + if len(live_out) != len(self._live_out): + return False + for i in range(self.op_size): + if live_in[i] != self._live_in[i]: + return False + for i in range(self.op_size): + if live_out[i] != self._live_out[i]: + return False + return True + + def _dataflow_analyze(self): + self._build_graph() + live_in = defaultdict(set) + live_out = defaultdict(set) + while True: + for i in range(self.op_size): + live_in[i] = set(self._live_in[i]) + live_out[i] = set(self._live_out[i]) + self._live_in[i] = self._uses[i] | ( + self._live_out[i] - self._defs[i]) + for s in self._succesors[i]: + self._live_out[i] |= self._live_in[s] + + if self._reach_fixed_point(live_in, live_out): + break + + def _get_diff(self, a, b): + u = a & b + return a - u, b - u + + def memory_optimize(self): + self._build_graph() + self._dataflow_analyze() + self.pool = [] + for i in range(self.op_size): + if self.pool: + out_pair = [(x, self.global_block_desc.var(str(x)).shape()) + for x in self._defs[i]] + for x, x_shape in 
out_pair: + if not self.global_block_desc.var(str(x)).persistable(): + for index, cache_pair in enumerate(self.pool): + cache_var = cache_pair[0] + cache_shape = cache_pair[1] + if x_shape == cache_shape: + x_dtype = self.global_block_desc.var(str( + x)).dtype() + cache_dtype = self.global_block_desc.var( + str(cache_var)).dtype() + # TODO(qijun): actually, we should compare dtype_to_size[x_dtype] + # and dtype_to_size[cache_dtype] + if x_dtype == cache_dtype: + print( + ("Hit Cache !!!! cache pool index " + "is %d, var name is %s, " + "cached var name is %s, " + "var shape is %s ") % + (index, x, cache_var, str(cache_shape))) + self.pool.pop(index) + _rename_arg_( + self.ops, x, cache_var, begin_idx=i) + self._program.current_block().var(str( + x)).desc = self.global_block_desc.var( + str(cache_var)) + self._update_graph( + x, cache_var, begin_idx=i) + break + + in_diff, out_diff = self._get_diff(self._live_in[i], + self._live_out[i]) + can_optimize = filter( + lambda x: not self.global_block_desc.var(str(x)).persistable(), + in_diff) + if can_optimize: + for var_name in can_optimize: + self.pool.append( + (var_name, + self.global_block_desc.var(str(var_name)).shape())) + + def get_program(self): + return self._program + + +def memory_optimize(input_program): + graph = ControlFlowGraph(input_program) + graph.memory_optimize() + result_program = graph.get_program() + return result_program diff --git a/python/paddle/v2/fluid/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py index 94fdd5e38970b309580de6fc934b158a3c46e464..7448975b59ba13bb29cd0a1bea043add251844e6 100644 --- a/python/paddle/v2/fluid/net_drawer.py +++ b/python/paddle/v2/fluid/net_drawer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import argparse import json import logging diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index 5e14ca594bc7965dc29039ba57bb7b26b1ce6871..b5c26e713db4e5c48d6c0851fd68275aeabeea64 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -1,6 +1,23 @@ -import paddle.v2.fluid.layers as layers +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
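+# The helpers below compose fluid.layers primitives into commonly used
+# blocks. A minimal usage sketch for simple_img_conv_pool, assuming `images`
+# is a 4-D NCHW data variable (the name is illustrative only):
+#
+#     conv_pool = simple_img_conv_pool(input=images, num_filters=20,
+#                                      filter_size=5, pool_size=2,
+#                                      pool_stride=2, act="relu")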
+import layers -__all__ = ["simple_img_conv_pool", "sequence_conv_pool"] +__all__ = [ + "simple_img_conv_pool", + "sequence_conv_pool", + "glu", +] def simple_img_conv_pool(input, @@ -9,24 +26,20 @@ def simple_img_conv_pool(input, pool_size, pool_stride, act, - pool_type='max', - main_program=None, - startup_program=None): + param_attr=None, + pool_type='max'): conv_out = layers.conv2d( input=input, num_filters=num_filters, filter_size=filter_size, - act=act, - main_program=main_program, - startup_program=startup_program) + param_attr=param_attr, + act=act) pool_out = layers.pool2d( input=conv_out, pool_size=pool_size, pool_type=pool_type, - pool_stride=pool_stride, - main_program=main_program, - startup_program=startup_program) + pool_stride=pool_stride) return pool_out @@ -36,12 +49,11 @@ def img_conv_group(input, conv_padding=1, conv_filter_size=3, conv_act=None, + param_attr=None, conv_with_batchnorm=False, conv_batchnorm_drop_rate=None, pool_stride=1, - pool_type=None, - main_program=None, - startup_program=None): + pool_type=None): """ Image Convolution Group, Used for vgg net. """ @@ -57,6 +69,7 @@ def img_conv_group(input, conv_padding = __extend_list__(conv_padding) conv_filter_size = __extend_list__(conv_filter_size) + param_attr = __extend_list__(param_attr) conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) @@ -70,52 +83,70 @@ def img_conv_group(input, num_filters=conv_num_filter[i], filter_size=conv_filter_size[i], padding=conv_padding[i], - act=local_conv_act, - main_program=main_program, - startup_program=startup_program) + param_attr=param_attr[i], + act=local_conv_act) if conv_with_batchnorm[i]: - tmp = layers.batch_norm( - input=tmp, - act=conv_act, - main_program=main_program, - startup_program=startup_program) + tmp = layers.batch_norm(input=tmp, act=conv_act) drop_rate = conv_batchnorm_drop_rate[i] if abs(drop_rate) > 1e-5: - tmp = layers.dropout( - x=tmp, - dropout_prob=drop_rate, - main_program=main_program, - startup_program=startup_program) + tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) pool_out = layers.pool2d( input=tmp, pool_size=pool_size, pool_type=pool_type, - pool_stride=pool_stride, - main_program=main_program, - startup_program=startup_program) + pool_stride=pool_stride) return pool_out def sequence_conv_pool(input, num_filters, filter_size, + param_attr=None, act="sigmoid", - pool_type="max", - main_program=None, - startup_program=None): + pool_type="max"): conv_out = layers.sequence_conv( input=input, num_filters=num_filters, filter_size=filter_size, - act=act, - main_program=main_program, - startup_program=startup_program) + param_attr=param_attr, + act=act) - pool_out = layers.sequence_pool( - input=conv_out, - pool_type=pool_type, - main_program=main_program, - startup_program=startup_program) + pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type) return pool_out + + +def glu(input, dim=-1): + """ + The gated linear unit composed by split, sigmoid activation and elementwise + multiplication. Specifically, Split the input into two equal sized parts + :math:`a` and :math:`b` along the given dimension and then compute as + following: + + .. math:: + + {GLU}(a, b)= a \otimes \sigma(b) + + Refer to `Language Modeling with Gated Convolutional Networks + `_. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int): The dimension along which to split. 
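+ The size of the input along this dimension must be even.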
If :math:`dim < 0`, the + dimension to split along is :math:`rank(input) + dim`. + + Returns: + Variable: The Tensor variable with half the size of input. + + Examples: + .. code-block:: python + + # x is a Tensor variable with shape [3, 6, 9] + fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9] + """ + + a, b = layers.split(input, num_or_sections=2, dim=dim) + act_b = layers.sigmoid(x=b) + out = layers.elementwise_mul(x=a, y=act_b) + return out diff --git a/python/paddle/v2/fluid/op.py b/python/paddle/v2/fluid/op.py index 5828803497ec06bc7644da18ca752f61469ca53f..4bc0f79c64876829d601ac06b9ca451a15300fe8 100644 --- a/python/paddle/v2/fluid/op.py +++ b/python/paddle/v2/fluid/op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core as core import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index d2841df6af7a0d860c239db952c767c995d30ba4..8bd62ef0c02b1cae1dca8d6f1414286dc643794b 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -1,16 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from collections import defaultdict -import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.framework import unique_name, Program -from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.initializer import ConstantInitializer -from paddle.v2.fluid.regularizer import append_regularization_ops -from paddle.v2.fluid.layer_helper import LayerHelper +import framework +from backward import append_backward +from framework import unique_name, program_guard +from initializer import Constant +from layer_helper import LayerHelper +from regularizer import append_regularization_ops +from clip import append_gradient_clip_ops, error_clip_callback -__all__ = [ - 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', - 'AdamaxOptimizer', 'DecayedAdagradOptimizer' -] +__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad'] class Optimizer(object): @@ -21,8 +32,9 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, global_step=None): + def __init__(self, global_step=None, regularization=None): self._global_step = global_step + self.regularization = regularization # Dictionary of accumulators. 
Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -48,7 +60,7 @@ class Optimizer(object): persistable=True) param_lr = param_lr * self._learning_rate self.helper.set_variable_initializer( - var=param_lr_var, initializer=ConstantInitializer(param_lr)) + var=param_lr_var, initializer=Constant(param_lr)) return param_lr_var def _create_accumulators(self, block, parameters): @@ -92,11 +104,11 @@ class Optimizer(object): var = self.helper.create_global_variable( name=unique_name(name), persistable=True, - dtype=dtype or param.data_type, + dtype=dtype or param.dtype, type=param.type, shape=param.shape) self.helper.set_variable_initializer( - var, initializer=ConstantInitializer(value=float(fill_value))) + var, initializer=Constant(value=float(fill_value))) self._accumulators[name][param.name] = var def _get_accumulator(self, name, param): @@ -161,33 +173,32 @@ class Optimizer(object): # Create any accumulators program = loss.block.program - self.helper = LayerHelper( - self.__class__.__name__, - main_program=program, - startup_program=startup_program) - self._create_accumulators(loss.block, - [p[0] for p in parameters_and_grads]) - - optimize_ops = [] - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is not None: - optimize_op = self._append_optimize_op(loss.block, - param_and_grad) - optimize_ops.append(optimize_op) - - # Returned list of ops can include more ops in addition - # to optimization ops - return_ops = optimize_ops - - # Get custom finish ops for subclasses - # FIXME: Need to fix this once we figure out how to handle dependencies - finish_ops = self._finish_update(loss.block) - if finish_ops is not None: - return_ops += finish_ops - - if self._global_step is not None: - return_ops.append(self._increment_global_step(loss.block)) - return return_ops + with program_guard(program, startup_program): + self.helper = LayerHelper(self.__class__.__name__) + self._create_accumulators(loss.block, + [p[0] for p in parameters_and_grads]) + + optimize_ops = [] + for param_and_grad in parameters_and_grads: + if param_and_grad[0].trainable is True and param_and_grad[ + 1] is not None: + optimize_op = self._append_optimize_op(loss.block, + param_and_grad) + optimize_ops.append(optimize_op) + + # Returned list of ops can include more ops in addition + # to optimization ops + return_ops = optimize_ops + + # Get custom finish ops for subclasses + # FIXME: Need to fix this once we figure out how to handle dependencies + finish_ops = self._finish_update(loss.block) + if finish_ops is not None: + return_ops += finish_ops + + if self._global_step is not None: + return_ops.append(self._increment_global_step(loss.block)) + return return_ops def minimize(self, loss, @@ -196,25 +207,30 @@ class Optimizer(object): no_grad_set=None): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. 
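+ A typical call, assuming `avg_cost` is a scalar loss variable, is
+ `opts, params_grads = optimizer.minimize(avg_cost)`.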
""" - params_grads = append_backward_ops(loss, parameter_list, no_grad_set or - set()) - # Add regularization if any - params_grads = append_regularization_ops(params_grads) + params_grads = append_backward(loss, parameter_list, no_grad_set, + error_clip_callback) + + params_grads = append_gradient_clip_ops(params_grads) + + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) + optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) - return optimize_ops + return optimize_ops, params_grads class SGDOptimizer(Optimizer): """ Simple SGD optimizer without any state. """ - def __init__(self, learning_rate, global_step=None): + def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(global_step) + super(SGDOptimizer, self).__init__(**kwargs) self.type = "sgd" self._learning_rate = learning_rate @@ -239,14 +255,10 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate, - momentum, - use_nesterov=False, - global_step=None): + def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(global_step) + super(MomentumOptimizer, self).__init__(**kwargs) self.type = "momentum" self._learning_rate = learning_rate self._momentum = momentum @@ -287,10 +299,10 @@ class AdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None): + def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(global_step) + super(AdagradOptimizer, self).__init__(**kwargs) self.type = "adagrad" self._learning_rate = learning_rate self._epsilon = epsilon @@ -334,12 +346,12 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(global_step) + super(AdamOptimizer, self).__init__(**kwargs) self.type = "adam" self._learning_rate = learning_rate self._beta1 = beta1 @@ -359,7 +371,7 @@ class AdamOptimizer(Optimizer): lod_level=0, persistable=True) self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1)) + self._beta1_pow_acc, initializer=Constant(self._beta1)) self._beta2_pow_acc = self.helper.create_global_variable( name=unique_name('beta2_pow_acc'), @@ -369,7 +381,7 @@ class AdamOptimizer(Optimizer): persistable=True) self.helper.set_variable_initializer( - self._beta2_pow_acc, initializer=ConstantInitializer(self._beta2)) + self._beta2_pow_acc, initializer=Constant(self._beta2)) # Create accumulator tensors for first and second moments for p in parameters: @@ -439,12 +451,12 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__() + super(AdamaxOptimizer, self).__init__(**kwargs) self.type = "adamax" self._learning_rate = learning_rate self._beta1 = beta1 @@ -461,7 +473,7 @@ class AdamaxOptimizer(Optimizer): lod_level=0, persistable=True) self.helper.set_variable_initializer( - self._beta1_pow_acc, 
initializer=ConstantInitializer(self._beta1)) + self._beta1_pow_acc, initializer=Constant(self._beta1)) # Create accumulator tensors for first moment and infinity norm for p in parameters: @@ -517,16 +529,12 @@ class DecayedAdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - global_step=None): + def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(global_step) + super(DecayedAdagradOptimizer, self).__init__(**kwargs) self.type = "decayed_adagrad" self._learning_rate = learning_rate self._decay = decay @@ -558,3 +566,19 @@ class DecayedAdagradOptimizer(Optimizer): attrs={"epsilon": self._epsilon}) return decayed_adagrad_op + + +# We short the class name, since users will use the optimizer with the package +# name. The sample code: +# +# import paddle.fluid as fluid +# +# sgd = fluid.optimizer.SGD(...) +# +# It is no need to add an `Optimizer` as the class suffix +SGD = SGDOptimizer +Momentum = MomentumOptimizer +Adagrad = AdagradOptimizer +Adam = AdamOptimizer +Adamax = AdamaxOptimizer +DecayedAdagrad = DecayedAdagradOptimizer diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py new file mode 100644 index 0000000000000000000000000000000000000000..3af0190590e775d4816410e4dbe0069868ea209c --- /dev/null +++ b/python/paddle/v2/fluid/param_attr.py @@ -0,0 +1,83 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
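+# ParamAttr bundles the per-parameter settings a layer accepts: name,
+# initializer, learning-rate scale, regularizer, trainable flag and gradient
+# clip. ParamAttr.to_attr() below also accepts shorthands; a sketch of the
+# accepted forms (the name "fc.w" is illustrative):
+#
+#     ParamAttr.to_attr(None)        # -> default ParamAttr()
+#     ParamAttr.to_attr("fc.w")      # -> ParamAttr(name="fc.w")
+#     ParamAttr.to_attr(Xavier())    # -> ParamAttr(initializer=Xavier())
+#     ParamAttr.to_attr(False)       # -> False, i.e. create no parameter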
+from initializer import Initializer, Xavier, Constant +from regularizer import WeightDecayRegularizer + +__all__ = ['ParamAttr'] + + +class ParamAttr(object): + def __init__(self, + name=None, + initializer=None, + learning_rate=1.0, + regularizer=None, + trainable=True, + clip=None): + self.name = name + self.initializer = initializer + self.learning_rate = learning_rate + self.regularizer = regularizer + self.trainable = trainable + self.clip = clip + + def set_default_initializer(self, initializer): + if initializer is None: + if self.initializer is None: + raise ValueError("ParamAttr.initializer is not set") + return + + if self.initializer is not None: + return + + self.initializer = initializer + + def set_default_param_initializer(self): + self.set_default_initializer(Xavier()) + + def set_default_bias_initializer(self): + self.set_default_initializer(Constant(0.0)) + + @staticmethod + def to_attr(arg): + if arg is None: + return ParamAttr() + elif isinstance(arg, list) or isinstance(arg, tuple): + return [ParamAttr.to_attr(a) for a in arg] + elif isinstance(arg, ParamAttr): + return arg + elif isinstance(arg, str) or isinstance(arg, unicode): + return ParamAttr(name=arg) + elif isinstance(arg, Initializer): + return ParamAttr(initializer=arg) + elif isinstance(arg, WeightDecayRegularizer): + return ParamAttr(regularizer=arg) + elif isinstance(arg, bool): + return ParamAttr.to_attr(None) if arg else False + else: + raise TypeError("{0} cast to ParamAttr".format(type(arg))) + + def to_kwargs(self, with_initializer=False): + kwargs = { + 'name': self.name, + 'optimize_attr': { + 'learning_rate': self.learning_rate + }, + 'regularizer': self.regularizer, + 'trainable': self.trainable, + 'clip_attr': self.clip + } + if with_initializer: + kwargs['initializer'] = self.initializer + return kwargs diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f049498b9ffbfc915d03673c1d009d0ab5e4f8fc --- /dev/null +++ b/python/paddle/v2/fluid/profiler.py @@ -0,0 +1,64 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import paddle.v2.fluid.core as core +from contextlib import contextmanager +import os + +__all__ = ['CudaProfiler'] + +NVPROF_CONFIG = [ + "gpustarttimestamp", + "gpuendtimestamp", + "gridsize3d", + "threadblocksize", + "streamid", + "enableonstart 0", + "conckerneltrace", +] + + +@contextmanager +def cuda_profiler(output_file, output_mode=None, config=None): + """The CUDA profiler. + This fuctions is used to profile CUDA program by CUDA runtime application + programming interface. The profiling result will be written into + `output_file` with Key-Value pair format or Comma separated values format. + The user can set the output mode by `output_mode` argument and set the + counters/options for profiling by `config` argument. 
The default config + is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d', + 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. + + Args: + output_file (string) : The output file name, the result will be + written into this file. + output_mode (string) : The output mode has Key-Value pair format and + Comma separated values format. It should be 'kvp' or 'csv'. + config (list of string) : The profiler options and counters can refer + to "Compute Command Line Profiler User Guide". + """ + if output_mode is None: + output_mode = 'csv' + if output_mode not in ['kvp', 'csv']: + raise ValueError("The output mode must be 'kvp' or 'csv'.") + config = NVPROF_CONFIG if config is None else config + config_file = 'nvprof_config_file' + with open(config_file, 'wb') as fp: + fp.writelines(["%s\n" % item for item in config]) + core.nvprof_init(output_file, output_mode, config_file) + # Enables profiler collection by the active CUDA profiling tool. + core.nvprof_start() + yield + # Disables profiler collection. + core.nvprof_stop() + os.remove(config_file) diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0c3a35185391873fe5bb98d1ed5ee1cf13aa15 --- /dev/null +++ b/python/paddle/v2/fluid/registry.py @@ -0,0 +1,208 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import re +import cStringIO +import warnings +import functools +import inspect + +import proto.framework_pb2 as framework_pb2 +from framework import OpProtoHolder, Variable, Program, Operator +from paddle.v2.fluid.layer_helper import LayerHelper, unique_name + +__all__ = [ + 'deprecated', + 'register_layer', + 'autodoc', +] + + +def _convert_(name): + """ + Formatting. + + Args: + name: The name/alias + + This function takes in a name and converts it to a standard format of + group1_group2. Where as per the regular expression, group1 can have + alphabets and numbers and group2 has capital alphabets. 
+ + """ + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def _generate_doc_string_(op_proto): + """ + Generate docstring by OpProto + + Args: + op_proto (framework_pb2.OpProto): a protobuf message typed OpProto + + Returns: + str: the document string + """ + + def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + if not isinstance(op_proto, framework_pb2.OpProto): + raise TypeError("OpProto should be `framework_pb2.OpProto`") + + buf = cStringIO.StringIO() + buf.write(op_proto.comment) + buf.write('\nArgs:\n') + for each_input in op_proto.inputs: + line_begin = ' {0}: '.format(_convert_(each_input.name)) + buf.write(line_begin) + buf.write(each_input.comment) + buf.write('\n') + buf.write(' ' * len(line_begin)) + buf.write('Duplicable: ') + buf.write(str(each_input.duplicable)) + buf.write(' Optional: ') + buf.write(str(each_input.dispensable)) + buf.write('\n') + + for each_attr in op_proto.attrs: + buf.write(' ') + buf.write(each_attr.name) + buf.write(' (') + buf.write(_type_to_str_(each_attr.type)) + buf.write('): ') + buf.write(each_attr.comment) + buf.write('\n') + + if len(op_proto.outputs) != 0: + buf.write('\nReturns:\n') + buf.write(' ') + for each_opt in op_proto.outputs: + if not each_opt.intermediate: + break + buf.write(each_opt.comment) + + return buf.getvalue() + + +def register_layer(op_type): + """Register the Python layer for an Operator. + + Args: + op_type: The name of the operator to be created. + + This function takes in the operator type (sigmoid, mean , average etc) and + creates the operator functionality. + + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + not_intermediate_outputs = \ + filter(lambda output: not output.intermediate, op_proto.outputs) + intermediate_outputs = \ + filter(lambda output: output.intermediate, op_proto.outputs) + + if len(not_intermediate_outputs) != 1: + raise ValueError("Only one non intermediate output operator can be", + "automatically generated.") + + if not_intermediate_outputs[0].duplicable: + raise ValueError( + "Only non duplicable op can be automatically generated.") + + for output in intermediate_outputs: + if output.duplicable: + raise ValueError("The op can be automatically generated only when ", + "all intermediate ops are not duplicable.") + + o_name = not_intermediate_outputs[0].name + intermediate_output_names = [output.name for output in intermediate_outputs] + + def infer_and_check_dtype(op_proto, **kwargs): + """ + This function performs the sanity check for dtype and + instance type. + """ + dtype = None + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + for each in val: + if not isinstance(each, Variable): + raise ValueError("input of {0} must be variable".format( + op_type)) + + if dtype is None: + dtype = each.dtype + elif dtype != each.dtype: + raise ValueError( + "operator {0} must input same dtype. 
{1} vs {2}".format( + op_type, dtype, each.dtype)) + + return dtype + + def func(**kwargs): + helper = LayerHelper(op_type, **kwargs) + + dtype = infer_and_check_dtype(op_proto, **kwargs) + + inputs = dict() + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + inputs[ipt.name] = val + + outputs = dict() + out = helper.create_tmp_variable(dtype=dtype) + outputs[o_name] = [out] + for name in intermediate_output_names: + outputs[name] = [helper.create_tmp_variable(dtype=dtype)] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) + return helper.append_activation(out) + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_(op_proto) + return func + + +def deprecated(func_or_class): + """ + Deprecated warning decorator. It will result a warning message. + Should be used before class or function, member function + """ + + @functools.wraps(func) + def func_wrapper(*args, **kwargs): + """ + Wrap func with deprecated warning + """ + warnings.simplefilter('always', DeprecationWarning) # turn off filter + warnings.warn( + "Call to deprecated function {}.".format(func.__name__), + category=DeprecationWarning, + stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) # reset filter + return func(*args, **kwargs) + + return func_wrapper + + +def autodoc(func): + func.__doc__ = _generate_doc_string_(OpProtoHolder.instance().get_op_proto( + func.__name__)) + return func diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index 098cd0dd6439554f49e429ab75fb11bfa2c9d28c..e53dee98fd025df8aa5ff2b74d3bdfd901402965 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -1,11 +1,26 @@ -import paddle.v2.fluid.framework as framework +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import framework __all__ = [ - 'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer' + 'append_regularization_ops', + 'L1Decay', + 'L2Decay', ] -def append_regularization_ops(parameters_and_grads): +def append_regularization_ops(parameters_and_grads, regularization=None): """Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. @@ -16,6 +31,8 @@ def append_regularization_ops(parameters_and_grads): Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. 
Returns: list of (parameters, gradients) pairs with the regularized gradients @@ -25,14 +42,19 @@ """ params_and_grads = [] for param, grad in parameters_and_grads: + regularization_term = None + if param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad.block) + # If no gradient or no regularization is specified, # then we don't need to do anything - if grad is None or param.regularizer is None: + if grad is None or regularization_term is None: params_and_grads.append((param, grad)) continue - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad.block) assert grad.shape == regularization_term.shape grad.block.append_op( @@ -139,3 +161,16 @@ class L1DecayRegularizer(WeightDecayRegularizer): attrs={"scale": self._regularization_coeff}) return decay + + +# We shorten the class names, since users will reference the regularizers +# through the package name. Sample code: +# +# import paddle.fluid as fluid +# +# hidden = fluid.layers.fc(..., +# param_attr=fluid.regularizer.L2Decay(1e-4)) +# +# There is no need to add a `Regularizer` suffix to the class name. +L1Decay = L1DecayRegularizer +L2Decay = L2DecayRegularizer diff --git a/python/paddle/v2/fluid/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore index fcc52c04886865d96c1bfe1597a9dc99c181de1f..62f82151eb42342cd90657b1e4dfc93410950e62 100644 --- a/python/paddle/v2/fluid/tests/.gitignore +++ b/python/paddle/v2/fluid/tests/.gitignore @@ -1,2 +1,4 @@ image/ fit_a_line.model/ +tmp +cuda_profiler.txt diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt index e795627bfe9e8ad0c196349a332e62e975f20aa3..9a0240cbf65c7a79e29babc2abcb157ada684c5e 100644 --- a/python/paddle/v2/fluid/tests/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/CMakeLists.txt @@ -5,3 +5,4 @@ foreach(src ${TEST_OPS}) endforeach() add_subdirectory(book) +add_subdirectory(book_distribute) diff --git a/python/paddle/v2/fluid/tests/__init__.py b/python/paddle/v2/fluid/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2619c1c0e9db17c38ccc6e1dd010bd9c1c5966bd --- /dev/null +++ b/python/paddle/v2/fluid/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
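A minimal usage sketch for the new `L1Decay`/`L2Decay` aliases and the `regularization` argument added above; the layer, variable names, and coefficient here are illustrative, not part of this diff:

import paddle.v2.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
# A per-parameter regularizer attached through ParamAttr; in
# append_regularization_ops it takes precedence over the global
# `regularization` argument, which only covers parameters without one.
hidden = fluid.layers.fc(
    input=x,
    size=128,
    act='relu',
    param_attr=fluid.ParamAttr(regularizer=fluid.regularizer.L2Decay(1e-4)))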
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt index 4d7664469e481344cf9eea84688f068b4fb99dee..a35abe3e0c436be4eaed01c9b9183344c6d3b275 100644 --- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt @@ -1,5 +1,11 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +list(REMOVE_ITEM TEST_OPS test_image_classification_train) +py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet) +py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg) + +# default test foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index a7f3bfc0caf76302674a00c80c2bd9ebf834f872..904df66dc1869ca4069a3f2e8dbce850c08e4253 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -1,23 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.io import save_persistables, load_persistables -from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.v2.fluid as fluid -x = layers.data(name='x', shape=[13], data_type='float32') +x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = layers.fc(input=x, size=1, act=None) +y_predict = fluid.layers.fc(input=x, size=1, act=None) -y = layers.data(name='y', shape=[1], data_type='float32') +y = fluid.layers.data(name='y', shape=[1], dtype='float32') -cost = layers.square_error_cost(input=y_predict, label=y) -avg_cost = layers.mean(x=cost) +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) -sgd_optimizer = SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 @@ -26,32 +34,21 @@ train_reader = paddle.batch( paddle.dataset.uci_housing.train(), buf_size=500), batch_size=BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) -exe.run(framework.default_startup_program()) +exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): - save_persistables(exe, "./fit_a_line.model/") - load_persistables(exe, "./fit_a_line.model/") + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): - x_data = 
np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("float32") - - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - # print tensor_x.get_dims() - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - # print tensor_y.get_dims() - outs = exe.run(framework.default_main_program(), - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost]) - out = np.array(outs[0]) - - if out[0] < 10.0: + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index efe63a68f0745eb728b569a03d0344877c1484f7..a06486aa08733a589ac9f0c7b65bb8e769eedcb1 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -1,18 +1,27 @@ -import numpy as np +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function + +import sys + import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -import paddle.v2.fluid.evaluator as evaluator -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.initializer import XavierInitializer -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid def resnet_cifar10(input, depth=32): def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - tmp = layers.conv2d( + tmp = fluid.layers.conv2d( input=input, filter_size=filter_size, num_filters=ch_out, @@ -20,12 +29,11 @@ def resnet_cifar10(input, depth=32): padding=padding, act=None, bias_attr=False) - return layers.batch_norm(input=tmp, act=act) + return fluid.layers.batch_norm(input=tmp, act=act) - def shortcut(input, ch_in, ch_out, stride, program, init_program): + def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None, program, - init_program) + return conv_bn_layer(input, ch_out, 1, stride, 0, None) else: return input @@ -33,7 +41,7 @@ def resnet_cifar10(input, depth=32): tmp = conv_bn_layer(input, ch_out, 3, stride, 1) tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) short = shortcut(input, ch_in, ch_out, stride) - return layers.elementwise_add(x=tmp, y=short, act='relu') + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') def layer_warp(block_func, input, ch_in, ch_out, count, stride): tmp = block_func(input, ch_in, ch_out, stride) @@ -48,14 +56,14 @@ def resnet_cifar10(input, depth=32): res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) res2 = layer_warp(basicblock, res1, 16, 32, n, 2) res3 = 
layer_warp(basicblock, res2, 32, 64, n, 2) - pool = layers.pool2d( + pool = fluid.layers.pool2d( input=res3, pool_size=8, pool_type='avg', pool_stride=1) return pool def vgg16_bn_drop(input): def conv_block(input, num_filter, groups, dropouts): - return nets.img_conv_group( + return fluid.nets.img_conv_group( input=input, pool_size=2, pool_stride=2, @@ -72,44 +80,41 @@ def vgg16_bn_drop(input): conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - drop = layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = layers.fc(input=drop, - size=512, - act=None, - param_attr={"initializer": XavierInitializer()}) - reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) - bn = layers.batch_norm(input=reshape1, act='relu') - drop2 = layers.dropout(x=bn, dropout_prob=0.5) - fc2 = layers.fc(input=drop2, - size=512, - act=None, - param_attr={"initializer": XavierInitializer()}) + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) return fc2 classdim = 10 data_shape = [3, 32, 32] -images = layers.data(name='pixel', shape=data_shape, data_type='float32') -label = layers.data(name='label', shape=[1], data_type='int64') +images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') -# Add neural network config -# option 1. resnet -# net = resnet_cifar10(images, 32) -# option 2. vgg -net = vgg16_bn_drop(images) +net_type = "vgg" +if len(sys.argv) >= 2: + net_type = sys.argv[1] -# print(program) +if net_type == "vgg": + print("train vgg net") + net = vgg16_bn_drop(images) +elif net_type == "resnet": + print("train resnet") + net = resnet_cifar10(images, 32) +else: + raise ValueError("%s network is not supported" % net_type) -predict = layers.fc(input=net, size=classdim, act='softmax') -cost = layers.cross_entropy(input=predict, label=label) -avg_cost = layers.mean(x=cost) +predict = fluid.layers.fc(input=net, size=classdim, act='softmax') +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) -# optimizer = SGDOptimizer(learning_rate=0.001) -optimizer = AdamOptimizer(learning_rate=0.001) +optimizer = fluid.optimizer.Adam(learning_rate=0.001) opts = optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy(input=predict, label=label) +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) BATCH_SIZE = 128 PASS_NUM = 1 @@ -119,42 +124,20 @@ train_reader = paddle.batch( paddle.dataset.cifar.train10(), buf_size=128 * 10), batch_size=BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) - -exe.run(framework.default_startup_program()) +place = fluid.CPUPlace() +exe = fluid.Executor(place) +feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) +exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): - batch_id = 0 accuracy.reset(exe) for data in train_reader(): - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - batch_size = 1 - for i in y_data.shape: - batch_size = batch_size * i - y_data = y_data.reshape([batch_size, 1]) - - tensor_img = core.LoDTensor() - tensor_y = core.LoDTensor() - tensor_img.set(img_data, place) - tensor_y.set(y_data, place) - - outs = 
exe.run(framework.default_main_program(), - feed={"pixel": tensor_img, - "label": tensor_y}, - fetch_list=[avg_cost, acc_out]) - - loss = np.array(outs[0]) - acc = np.array(outs[1]) + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) - print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) + - " loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( - pass_acc)) - batch_id = batch_id + 1 - - if batch_id > 1: - # this model is slow, so if we can train two mini batch, we think it works properly. - exit(0) + print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( + pass_acc)) + # this model is slow, so we exit after the first mini-batch; if one batch trains, we assume the code works properly. + exit(0) exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py new file mode 100644 index 0000000000000000000000000000000000000000..42971da0f042e970f7f657e4fd3a66ad7ecf0dc1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -0,0 +1,221 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import math + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.fluid as fluid +import time + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) + +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +mix_hidden_lr = 1e-3 + +IS_SPARSE = True +PASS_NUM = 10 +BATCH_SIZE = 20 + +embedding_name = 'emb' + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. 
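+ # The 16 bytes skipped above are assumed to be the legacy Paddle
+ # parameter-file header; only the raw float32 payload that follows is
+ # read and reshaped into the (h, w) embedding matrix below.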
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[pred_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + ]) + + return feature_out + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + # define network topology + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) + avg_cost = fluid.layers.mean(x=crf_cost) + + # TODO(qiao) + # check other optimizers and check why out will be NAN + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + sgd_optimizer.minimize(avg_cost) + + # TODO(qiao) + # add dependency track and move this config before optimizer + crf_decode = fluid.layers.crf_decoding( 
+ input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + chunk_evaluator = fluid.evaluator.ChunkEvaluator( + input=crf_decode, + label=target, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=BATCH_SIZE) + #place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + + embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) + + start_time = time.time() + batch_id = 0 + for pass_id in xrange(PASS_NUM): + chunk_evaluator.reset(exe) + for data in train_data(): + cost, precision, recall, f1_score = exe.run( + fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( + exe) + + if batch_id % 10 == 0: + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + str( + f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str(pass_recall) + + " pass_f1_score:" + str(pass_f1_score)) + if batch_id != 0: + print("seconds per batch: " + str((time.time() - start_time) + / batch_id)) + + # exit early for CI + exit(0) + + batch_id = batch_id + 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..deeb6b1badc9e12caf4c95f949c3c622b04cf8b4 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -0,0 +1,132 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
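+# A bare-bones sequence-to-sequence example on the WMT14 dataset: a
+# dynamic-LSTM encoder whose last step initializes a DynamicRNN decoder;
+# no attention or beam search is used.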
+import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.executor import Executor + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 16 +IS_SPARSE = True +batch_size = 10 +max_length = 50 +topk_size = 50 +trg_dic_size = 10000 + +decoder_size = hidden_dim + + +def encoder_decoder(): + # encoder + src_word_id = layers.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + + # decoder + trg_language_word = layers.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + mem = rnn.memory(init=encoder_out) + fc1 = fluid.layers.fc(input=[current_word, mem], + size=decoder_size, + act='tanh') + out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') + rnn.update_memory(mem, fc1) + rnn.output(out) + + return rnn() + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + rnn_out = encoder_decoder() + label = layers.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(2): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(framework.default_main_program(), + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + exit(0) + batch_id += 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py index 
8f737689609fec4d1819ae58b9665298547a3716..1d5defbed332e8c4c989e6c1f236836bb4b0a3f9 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py @@ -1,23 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.evaluator as evaluator -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid -images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32') -label = layers.data(name='label', shape=[1], data_type='int64') -conv_pool_1 = nets.simple_img_conv_pool( +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +conv_pool_1 = fluid.nets.simple_img_conv_pool( input=images, filter_size=5, num_filters=20, pool_size=2, pool_stride=2, act="relu") -conv_pool_2 = nets.simple_img_conv_pool( +conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, num_filters=50, @@ -25,13 +33,13 @@ conv_pool_2 = nets.simple_img_conv_pool( pool_stride=2, act="relu") -predict = layers.fc(input=conv_pool_2, size=10, act="softmax") -cost = layers.cross_entropy(input=predict, label=label) -avg_cost = layers.mean(x=cost) -optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) -opts = optimizer.minimize(avg_cost) +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy(input=predict, label=label) +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) BATCH_SIZE = 50 PASS_NUM = 3 @@ -40,30 +48,17 @@ train_reader = paddle.batch( paddle.dataset.mnist.train(), buf_size=500), batch_size=BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) - -exe.run(framework.default_startup_program()) +place = fluid.CPUPlace() +exe = fluid.Executor(place) +feeder = fluid.DataFeeder(feed_list=[images, label], place=place) +exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([BATCH_SIZE, 1]) - - tensor_img = core.LoDTensor() - tensor_y = core.LoDTensor() - tensor_img.set(img_data, place) - tensor_y.set(y_data, place) - - outs = exe.run(framework.default_main_program(), - feed={"pixel": tensor_img, - "label": tensor_y}, - 
fetch_list=[avg_cost, acc_out]) - loss = np.array(outs[0]) - acc = np.array(outs[1]) + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index e42e4c9cc0024e193b0732df6d9ca3200df5f0b9..02da2fcc8544d0f3ccfefa3c88af2f7297d1c76f 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -1,76 +1,95 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.evaluator as evaluator -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.initializer import UniformInitializer -from paddle.v2.fluid.optimizer import MomentumOptimizer -from paddle.v2.fluid.regularizer import L2DecayRegularizer +import paddle.v2.fluid as fluid BATCH_SIZE = 128 -image = layers.data(name='x', shape=[784], data_type='float32') +image = fluid.layers.data(name='x', shape=[784], dtype='float32') -param_attr = { - 'name': None, - 'initializer': UniformInitializer( - low=-1.0, high=1.0), - 'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE) -} +regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE) -hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr) -hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr) +hidden1 = fluid.layers.fc(input=image, + size=128, + act='relu', + param_attr=fluid.ParamAttr( + regularizer=regularizer, + clip=fluid.clip.ClipByValue(10))) -predict = layers.fc(input=hidden2, - size=10, - act='softmax', - param_attr=param_attr) +hidden2 = fluid.layers.fc(input=hidden1, + size=64, + act='relu', + param_attr=regularizer) -label = layers.data(name='y', shape=[1], data_type='int64') +predict = fluid.layers.fc(input=hidden2, + size=10, + act='softmax', + param_attr=regularizer) -cost = layers.cross_entropy(input=predict, label=label) -avg_cost = layers.mean(x=cost) +label = fluid.layers.data(name='y', shape=[1], dtype='int64') -optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) + +optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opts = optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy(input=predict, label=label) +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + +inference_program = fluid.default_main_program().clone() +with fluid.program_guard(inference_program): + test_accuracy = 
fluid.evaluator.Accuracy(input=predict, label=label) + test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states + inference_program = fluid.io.get_inference_program(test_target) train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=8192), batch_size=BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) +test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) -exe.run(framework.default_startup_program()) +place = fluid.CPUPlace() +exe = fluid.Executor(place) +feeder = fluid.DataFeeder(feed_list=[image, label], place=place) +exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - - outs = exe.run(framework.default_main_program(), - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost, acc_out]) - out = np.array(outs[0]) - acc = np.array(outs[1]) + out, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) - if pass_acc > 0.7: + test_accuracy.reset(exe) + for data in test_reader(): + out, acc = exe.run(inference_program, + feed=feeder.feed(data), + fetch_list=[avg_cost] + test_accuracy.metrics) + + test_pass_acc = test_accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " train_cost=" + str( + out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc) + + " test_acc=" + str(test_pass_acc)) + + if test_pass_acc > 0.7: + fluid.io.save_inference_model( + "./recognize_digits_mlp.inference.model/", ["x"], [predict], + exe) exit(0) - # print("pass_id=" + str(pass_id) + " auc=" + - # str(acc) + " pass_acc=" + str(pass_acc)) + exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index 55ded3aed3a23c8cd7795f915dc1cbd512c6d945..47e2afcd83be12c37d95574afff7ccd2e8a781a6 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
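+# Recommender-system example: user features (id, gender, age, job) and movie
+# features (id, category, title) are embedded, fused through fc layers, and
+# scored with a cosine similarity scaled onto the 5-point rating range.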
import numpy as np import paddle.v2 as paddle import paddle.v2.fluid.core as core @@ -18,47 +31,47 @@ def get_usr_combined_features(): USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - uid = layers.data(name='user_id', shape=[1], data_type='int64') + uid = layers.data(name='user_id', shape=[1], dtype='int64') usr_emb = layers.embedding( input=uid, - data_type='float32', + dtype='float32', size=[USR_DICT_SIZE, 32], - param_attr={'name': 'user_table'}, + param_attr='user_table', is_sparse=IS_SPARSE) usr_fc = layers.fc(input=usr_emb, size=32) USR_GENDER_DICT_SIZE = 2 - usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64') + usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') usr_gender_emb = layers.embedding( input=usr_gender_id, size=[USR_GENDER_DICT_SIZE, 16], - param_attr={'name': 'gender_table'}, + param_attr='gender_table', is_sparse=IS_SPARSE) usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) - usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64") + usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") usr_age_emb = layers.embedding( input=usr_age_id, size=[USR_AGE_DICT_SIZE, 16], is_sparse=IS_SPARSE, - param_attr={'name': 'age_table'}) + param_attr='age_table') usr_age_fc = layers.fc(input=usr_age_emb, size=16) USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 - usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64") + usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") usr_job_emb = layers.embedding( input=usr_job_id, size=[USR_JOB_DICT_SIZE, 16], - param_attr={'name': 'job_table'}, + param_attr='job_table', is_sparse=IS_SPARSE) usr_job_fc = layers.fc(input=usr_job_emb, size=16) @@ -75,20 +88,20 @@ def get_mov_combined_features(): MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - mov_id = layers.data(name='movie_id', shape=[1], data_type='int64') + mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') mov_emb = layers.embedding( input=mov_id, - data_type='float32', + dtype='float32', size=[MOV_DICT_SIZE, 32], - param_attr={'name': 'movie_table'}, + param_attr='movie_table', is_sparse=IS_SPARSE) mov_fc = layers.fc(input=mov_emb, size=32) CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data(name='category_id', shape=[1], data_type='int64') + category_id = layers.data(name='category_id', shape=[1], dtype='int64') mov_categories_emb = layers.embedding( input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) @@ -98,7 +111,7 @@ def get_mov_combined_features(): MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64') + mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') mov_title_emb = layers.embedding( input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) @@ -125,10 +138,11 @@ def model(): # need cos sim inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) + scale_infer = layers.scale(x=inference, scale=5.0) - label = layers.data(name='score', shape=[1], data_type='float32') + label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=inference, label=label) + square_cost = layers.square_error_cost(input=scale_infer, label=label) avg_cost = layers.mean(x=square_cost) @@ -141,7 +155,7 @@ def main(): opts = 
sgd_optimizer.minimize(cost) if USE_GPU: - place = core.GPUPlace(0) + place = core.CUDAPlace(0) else: place = core.CPUPlace() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py index 4929f7cf615e61de5c4f61ef44c5340e9ac4492a..b44d2b41e36633ed8feb7dd8c0e3884e48ffe917 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py @@ -1,40 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.evaluator as evaluator -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid -def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): - data = layers.data(name="words", shape=[1], data_type="int64") - label = layers.data(name="label", shape=[1], data_type="int64") - - emb = layers.embedding(input=data, size=[input_dim, emb_dim]) - conv_3 = nets.sequence_conv_pool( +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) + conv_3 = fluid.nets.sequence_conv_pool( input=emb, num_filters=hid_dim, filter_size=3, act="tanh", pool_type="sqrt") - conv_4 = nets.sequence_conv_pool( + conv_4 = fluid.nets.sequence_conv_pool( input=emb, num_filters=hid_dim, filter_size=4, act="tanh", pool_type="sqrt") - prediction = layers.fc(input=[conv_3, conv_4], - size=class_dim, - act="softmax") - cost = layers.cross_entropy(input=prediction, label=label) - avg_cost = layers.mean(x=cost) - adam_optimizer = AdamOptimizer(learning_rate=0.002) - opts = adam_optimizer.minimize(avg_cost) - accuracy, acc_out = evaluator.accuracy(input=prediction, label=label) - return avg_cost, accuracy, acc_out + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0] def to_lodtensor(data, place): @@ -46,7 +52,7 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res @@ -60,35 +66,28 @@ def main(): dict_dim = len(word_dict) class_dim = 2 + data = fluid.layers.data( + 
name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost, accuracy, acc_out = convolution_net( - input_dim=dict_dim, class_dim=class_dim) + data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=BATCH_SIZE) - place = core.CPUPlace() - exe = Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - exe.run(framework.default_startup_program()) + exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) for data in train_data(): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([BATCH_SIZE, 1]) - - tensor_label = core.LoDTensor() - tensor_label.set(label, place) - - outs = exe.run(framework.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) + cost_val, acc_val = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py index b3ee91938865afb929670a388a561b156aec1fe9..5a139c1dcd41305aa5aece96f6b2aabde0235b95 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py @@ -1,49 +1,57 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.evaluator as evaluator -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid -def stacked_lstm_net(input_dim, +def stacked_lstm_net(data, + label, + input_dim, class_dim=2, emb_dim=128, hid_dim=512, stacked_num=3): assert stacked_num % 2 == 1 - data = layers.data(name="words", shape=[1], data_type="int64") - label = layers.data(name="label", shape=[1], data_type="int64") - emb = layers.embedding(input=data, size=[input_dim, emb_dim]) + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) # add bias attr # TODO(qijun) linear act - fc1 = layers.fc(input=emb, size=hid_dim) - lstm1, cell1 = layers.dynamic_lstm(input=fc1, size=hid_dim) + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) inputs = [fc1, lstm1] for i in range(2, stacked_num + 1): - fc = layers.fc(input=inputs, size=hid_dim) - lstm, cell = layers.dynamic_lstm( + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( input=fc, size=hid_dim, is_reverse=(i % 2) == 0) inputs = [fc, lstm] - fc_last = layers.sequence_pool(input=inputs[0], pool_type='max') - lstm_last = layers.sequence_pool(input=inputs[1], pool_type='max') + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') - prediction = layers.fc(input=[fc_last, lstm_last], - size=class_dim, - act='softmax') - cost = layers.cross_entropy(input=prediction, label=label) - avg_cost = layers.mean(x=cost) - adam_optimizer = AdamOptimizer(learning_rate=0.002) - opts = adam_optimizer.minimize(avg_cost) - accuracy, acc_out = evaluator.accuracy(input=prediction, label=label) - return avg_cost, accuracy, acc_out + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0] def to_lodtensor(data, place): @@ -55,7 +63,7 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res @@ -70,35 +78,28 @@ def main(): dict_dim = len(word_dict) class_dim = 2 + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost, accuracy, acc_out = stacked_lstm_net( - input_dim=dict_dim, class_dim=class_dim) + data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=BATCH_SIZE) - place = core.CPUPlace() - exe = Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - exe.run(framework.default_startup_program()) + exe.run(fluid.default_startup_program()) for pass_id in 
xrange(PASS_NUM): accuracy.reset(exe) for data in train_data(): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([BATCH_SIZE, 1]) - - tensor_label = core.LoDTensor() - tensor_label.set(label, place) - - outs = exe.run(framework.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) + cost_val, acc_val = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index 280f6e902c34512735a27586221c2be68963ef2b..fab8a82f85ddeed7131df3777e978cc7c0a1b86e 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -1,40 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid +from paddle.v2.fluid.layer_helper import LayerHelper + + +def lstm(x, c_pre_init, hidden_dim, forget_bias=None): + """ + This function helps create an operator for the LSTM (Long Short Term + Memory) cell that can be used inside an RNN. 
+ """ + helper = LayerHelper('lstm_unit', **locals()) + rnn = fluid.layers.StaticRNN() + with rnn.step(): + c_pre = rnn.memory(init=c_pre_init) + x_t = rnn.step_input(x) + + before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1) + after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4) + + dtype = x.dtype + c = helper.create_tmp_variable(dtype) + h = helper.create_tmp_variable(dtype) + + helper.append_op( + type='lstm_unit', + inputs={"X": after_fc, + "C_prev": c_pre}, + outputs={"C": c, + "H": h}, + attrs={"forget_bias": forget_bias}) + + rnn.update_memory(c_pre, c) + rnn.output(h) + + return rnn() def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): - data = layers.data( + data = fluid.layers.data( name="words", shape=[seq_len * batch_size, 1], append_batch_size=False, - data_type="int64") - label = layers.data( + dtype="int64", + lod_level=1) + label = fluid.layers.data( name="label", shape=[batch_size, 1], append_batch_size=False, - data_type="int64") + dtype="int64") - emb = layers.embedding(input=data, size=[dict_dim, emb_dim]) - emb = layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim]) - emb = layers.transpose(x=emb, axis=[1, 0, 2]) + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim]) + emb = fluid.layers.transpose(x=emb, axis=[1, 0, 2]) - c_pre_init = layers.fill_constant( - dtype=emb.data_type, shape=[batch_size, emb_dim], value=0.0) - layer_1_out = layers.lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) - layer_1_out = layers.transpose(x=layer_1_out, axis=[1, 0, 2]) + c_pre_init = fluid.layers.fill_constant( + dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0) + c_pre_init.stop_gradient = False + layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) + layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2]) - prediction = layers.fc(input=layer_1_out, size=class_dim, act="softmax") - cost = layers.cross_entropy(input=prediction, label=label) + prediction = fluid.layers.fc(input=layer_1_out, + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = layers.mean(x=cost) - adam_optimizer = AdamOptimizer(learning_rate=0.002) - opts = adam_optimizer.minimize(avg_cost) - acc = layers.accuracy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + acc = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, acc @@ -48,57 +94,65 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res -def chop_data(data, chop_len=80, batch_len=50): +def chop_data(data, chop_len=80, batch_size=50): data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len] - return data[:batch_len] + return data[:batch_size] def prepare_feed_data(data, place): tensor_words = to_lodtensor(map(lambda x: x[0], data), place) label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([50, 1]) - tensor_label = core.LoDTensor() + label = label.reshape([len(label), 1]) + tensor_label = fluid.LoDTensor() tensor_label.set(label, place) return tensor_words, tensor_label def main(): + BATCH_SIZE = 100 + PASS_NUM = 5 
+ + word_dict = paddle.dataset.imdb.word_dict() - cost, acc = lstm_net(dict_dim=len(word_dict), class_dim=2) + print "loaded word dict successfully" + dict_dim = len(word_dict) + class_dim = 2 + + cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim) - batch_size = 100 train_data = paddle.batch( - paddle.reader.buffered( - paddle.dataset.imdb.train(word_dict), size=batch_size * 10), - batch_size=batch_size) - - data = chop_data(next(train_data())) - - place = core.CPUPlace() - tensor_words, tensor_label = prepare_feed_data(data, place) - exe = Executor(place) - exe.run(framework.default_startup_program()) - - while True: - outs = exe.run(framework.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) - - print("cost=" + str(cost_val) + " acc=" + str(acc_val)) - if acc_val > 0.9: - break + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10), + batch_size=BATCH_SIZE) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + + for pass_id in xrange(PASS_NUM): + for data in train_data(): + chopped_data = chop_data(data) + tensor_words, tensor_label = prepare_feed_data(chopped_data, place) + + outs = exe.run(fluid.default_main_program(), + feed={"words": tensor_words, + "label": tensor_label}, + fetch_list=[cost, acc]) + cost_val = np.array(outs[0]) + acc_val = np.array(outs[1]) + + print("cost=" + str(cost_val) + " acc=" + str(acc_val)) + if acc_val > 0.7: + exit(0) + exit(1) if __name__ == '__main__': main() diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index afa7b285198e0349317e123e4bd98e8336217afa..3d4bbccd33d7d9d0f492a76e04b98d8f7efac91d 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -1,10 +1,19 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
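+# 4-gram language-model example on the imikolov dataset: four context words
+# share one embedding table ('shared_w'); their concatenated embeddings feed
+# a sigmoid fc and a softmax fc that predicts the next word.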
import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.v2.fluid as fluid PASS_NUM = 100 EMBED_SIZE = 32 @@ -16,94 +25,62 @@ IS_SPARSE = True word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) -first_word = layers.data(name='firstw', shape=[1], data_type='int64') -second_word = layers.data(name='secondw', shape=[1], data_type='int64') -third_word = layers.data(name='thirdw', shape=[1], data_type='int64') -forth_word = layers.data(name='forthw', shape=[1], data_type='int64') -next_word = layers.data(name='nextw', shape=[1], data_type='int64') +first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') +second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') +third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') +forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') +next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') -embed_first = layers.embedding( +embed_first = fluid.layers.embedding( input=first_word, size=[dict_size, EMBED_SIZE], - data_type='float32', + dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) -embed_second = layers.embedding( + param_attr='shared_w') +embed_second = fluid.layers.embedding( input=second_word, size=[dict_size, EMBED_SIZE], - data_type='float32', + dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) -embed_third = layers.embedding( + param_attr='shared_w') +embed_third = fluid.layers.embedding( input=third_word, size=[dict_size, EMBED_SIZE], - data_type='float32', + dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) -embed_forth = layers.embedding( + param_attr='shared_w') +embed_forth = fluid.layers.embedding( input=forth_word, size=[dict_size, EMBED_SIZE], - data_type='float32', + dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) + param_attr='shared_w') -concat_embed = layers.concat( +concat_embed = fluid.layers.concat( input=[embed_first, embed_second, embed_third, embed_forth], axis=1) -hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') -predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax') -cost = layers.cross_entropy(input=predict_word, label=next_word) -avg_cost = layers.mean(x=cost) -sgd_optimizer = SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') +predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') +cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) +avg_cost = fluid.layers.mean(x=cost) +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +exe = fluid.Executor(place) +feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) -# fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove -# below exit line. 
-exit(0) - -exe.run(framework.default_startup_program()) +exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): for data in train_reader(): - input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)] - input_data = map(lambda x: np.array(x).astype("int64"), input_data) - input_data = map(lambda x: np.expand_dims(x, axis=1), input_data) - - first_data = input_data[0] - first_tensor = core.LoDTensor() - first_tensor.set(first_data, place) - - second_data = input_data[1] - second_tensor = core.LoDTensor() - second_tensor.set(second_data, place) - - third_data = input_data[2] - third_tensor = core.LoDTensor() - third_tensor.set(third_data, place) - - forth_data = input_data[3] - forth_tensor = core.LoDTensor() - forth_tensor.set(forth_data, place) - - next_data = input_data[4] - next_tensor = core.LoDTensor() - next_tensor.set(next_data, place) - - outs = exe.run(framework.default_main_program(), - feed={ - 'firstw': first_tensor, - 'secondw': second_tensor, - 'thirdw': third_tensor, - 'forthw': forth_tensor, - 'nextw': next_tensor - }, - fetch_list=[avg_cost]) - out = np.array(outs[0]) - if out[0] < 10.0: + avg_cost_np = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + if avg_cost_np[0] < 5.0: exit(0) # if avg cost less than 10.0, we think our code is good. exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt b/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d7664469e481344cf9eea84688f068b4fb99dee --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py new file mode 100644 index 0000000000000000000000000000000000000000..b886071f94768d3373bbf0e0b7655c924b218645 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
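+# Distributed fit-a-line: plain linear regression on UCI housing, split by
+# DistributeTranspiler into pserver/trainer roles. A typical launch (the
+# endpoint values here are illustrative, assuming one pserver on localhost):
+#   PSERVERS=127.0.0.1:6174 SERVER_ENDPOINT=127.0.0.1:6174 \
+#   TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
+#   PSERVERS=127.0.0.1:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py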
+import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +x = fluid.layers.data(name='x', shape=[13], dtype='float32') + +y_predict = fluid.layers.fc(input=x, size=1, act=None) + +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +t = fluid.DistributeTranspiler() +# all parameter server endpoints list for splitting parameters +pserver_endpoints = os.getenv("PSERVERS") +# server endpoint for current node +current_endpoint = os.getenv("SERVER_ENDPOINT") +# run as trainer or parameter server +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +else: + trainer_prog = t.get_trainer_program() + + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: + exit(0) +exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py new file mode 100644 index 0000000000000000000000000000000000000000..2b5a098ff253b8a96afba7cd03d7f9998ff400af --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py @@ -0,0 +1,238 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import math + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.fluid as fluid +import time +import os + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) + +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +mix_hidden_lr = 1e-3 + +IS_SPARSE = True +PASS_NUM = 10 +BATCH_SIZE = 20 + +embedding_name = 'emb' + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header.
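+ # the remainder of the file is raw float32 values, reshaped on the
+ # next line into the (h, w) embedding matrix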
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[pred_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + ]) + + return feature_out + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + # define network topology + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) + avg_cost = fluid.layers.mean(x=crf_cost) + + # TODO(qiao) + # check other optimizers and check why out will be NAN + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + + # TODO(qiao) + # add dependency track and move this config before optimizer + crf_decode 
= fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + chunk_evaluator = fluid.evaluator.ChunkEvaluator( + input=crf_decode, + label=target, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=BATCH_SIZE) + place = fluid.CPUPlace() + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) + exe = fluid.Executor(place) + + t = fluid.DistributeTranspiler() + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv( + "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) + elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + start_time = time.time() + batch_id = 0 + exe.run(fluid.default_startup_program()) + embedding_param = fluid.global_scope().find_var( + embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), + place) + for pass_id in xrange(PASS_NUM): + chunk_evaluator.reset(exe) + for data in train_data(): + cost, precision, recall, f1_score = exe.run( + trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( + exe) + + if batch_id % 10 == 0: + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) + if batch_id != 0: + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) + + batch_id = batch_id + 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..dc04af5b7b6ee143847685d6cf4da91747afd3ec --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py @@ -0,0 +1,109 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
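+# Distributed variant of the word2vec test: the same 4-gram network as above,
+# transpiled for TRAINERS=2 trainer processes plus a parameter-server program.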
+from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +PASS_NUM = 100 +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 +BATCH_SIZE = 32 +IS_SPARSE = True +TRAINERS = 2 + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) + +first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') +second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') +third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') +forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') +next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + +embed_first = fluid.layers.embedding( + input=first_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') +embed_second = fluid.layers.embedding( + input=second_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') +embed_third = fluid.layers.embedding( + input=third_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') +embed_forth = fluid.layers.embedding( + input=forth_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + +concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) +hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') +predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') +cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) +avg_cost = fluid.layers.mean(x=cost) +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) +train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +t = fluid.DistributeTranspiler() +# all parameter server endpoints list for splitting parameters +pserver_endpoints = os.getenv("PSERVERS") +# server endpoint for current node +current_endpoint = os.getenv("SERVER_ENDPOINT") +# run as trainer or parameter server +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS) +if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +elif training_role == "TRAINER": + feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) + exe.run(fluid.default_startup_program()) + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_cost_np = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print("avg_cost_np", avg_cost_np) + if avg_cost_np[0] < 5.0: + exit( + 0) # if avg cost is less than 5.0, we think our code is good.
+else: + print("environment var TRAINING_ROLE should be TRAINER or PSERVER") +exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..27512c4f7812b6b55d5dc6d1a12c3b83df8b3e6f --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") +conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimize_ops, params_grads = optimizer.minimize(avg_cost) + +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + +BATCH_SIZE = 50 +PASS_NUM = 3 +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +t = fluid.DistributeTranspiler() +# all parameter server endpoints list for splitting parameters +pserver_endpoints = os.getenv("PSERVERS") +# server endpoint for current node +current_endpoint = os.getenv("SERVER_ENDPOINT") +# run as trainer or parameter server +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + batch_id = 0 + for data in train_reader(): + loss, acc = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + if batch_id % 100 == 0: + print("batch_id %d, loss: %f, acc: %f" % + (batch_id, loss, pass_acc)) + batch_id += 1 + + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " 
pass_acc=" + str(pass_acc)) +else: + print("environment var TRAINING_ROLE should be TRAINER or PSERVER") diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..74f20f3f4cc8c81a38c1ad8ab33df6a07fbcad44 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function +import os +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + + +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + optimize_ops, params_grads = adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + BATCH_SIZE = 100 + PASS_NUM = 5 + + word_dict = paddle.dataset.imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net( + data, label, input_dim=dict_dim, class_dim=class_dim) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=BATCH_SIZE) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + t = fluid.DistributeTranspiler() + + # all parameter server endpoints list for splitting parameters + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv( + "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t.transpile( + 
optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + exe.run(fluid.default_startup_program()) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(pserver_prog) + elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + + for pass_id in xrange(PASS_NUM): + accuracy.reset(exe) + for data in train_data(): + cost_val, acc_val = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) + pass_acc = accuracy.eval(exe) + print("cost=" + str(cost_val) + " acc=" + str(acc_val) + + " pass_acc=" + str(pass_acc)) + if cost_val < 1.0 and pass_acc > 0.8: + exit(0) + else: + print("environment var TRAINING_ROLE should be TRAINER or PSERVER") + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py b/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py new file mode 100644 index 0000000000000000000000000000000000000000..f979f642d8f8cf5869cd74d6f89d1d01f5860504 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import math +import unittest +from paddle.v2.fluid.distribute_transpiler import split_dense_variable +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import random + + +class TestSplitVar(unittest.TestCase): + def test_check_output(self): + # split below shapes to 10 servers + shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]] + expected_sizes = [ + [15], [1024], + [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784], + [2040, 2040, 2040, 2040], + [1150, 1150, 1150, 1150, 1150, 1150, 1100] + ] + var_list = [] + program = fluid.Program() + for shape in shapes: + var = program.global_block().create_var( + name=str(random.randint(10000, 99999)), + persistable=True, + # dtype=core.VarDesc.VarType.LOD_TENSOR, + shape=shape) + var_list.append(var) + blocks = split_dense_variable(var_list, 10) + all_sizes = [] + for s in expected_sizes: + for s2 in s: + all_sizes.append(s2) + for i, block_str in enumerate(blocks): + varname, block_id, size = block_str.split(":") + self.assertEqual(int(size), all_sizes[i]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/decorators.py b/python/paddle/v2/fluid/tests/decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..3b314a15e1b054ece50ad5d697c5fac3bbfedbdc --- /dev/null +++ b/python/paddle/v2/fluid/tests/decorators.py @@ -0,0 +1,42 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import paddle.v2.fluid as fluid + +__all__ = ['many_times', 'prog_scope'] + + +def many_times(times): + def __impl__(fn): + def __fn__(*args, **kwargs): + for _ in range(times): + fn(*args, **kwargs) + + return __fn__ + + return __impl__ + + +def prog_scope(): + def __impl__(fn): + def __fn__(*args, **kwargs): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + fn(*args, **kwargs) + + return __fn__ + + return __impl__ diff --git a/python/paddle/v2/fluid/tests/demo/fc_gan.py b/python/paddle/v2/fluid/tests/demo/fc_gan.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9e8f950779be214a5fd18c4b9d3a0c3f74282b --- /dev/null +++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py @@ -0,0 +1,170 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
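+# fc_gan demo: a fully-connected GAN on MNIST. d_program scores real and
+# generated images with D(), while dg_program updates only G()'s parameters
+# so the generator learns to fool the discriminator.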
+import errno +import math +import os + +import matplotlib +import numpy + +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + +NOISE_SIZE = 100 +NUM_PASS = 1000 +NUM_REAL_IMGS_IN_BATCH = 121 +NUM_TRAIN_TIMES_OF_DG = 3 +LEARNING_RATE = 2e-5 + + +def D(x): + hidden = fluid.layers.fc(input=x, + size=200, + act='relu', + param_attr='D.w1', + bias_attr='D.b1') + logits = fluid.layers.fc(input=hidden, + size=1, + act=None, + param_attr='D.w2', + bias_attr='D.b2') + return logits + + +def G(x): + hidden = fluid.layers.fc(input=x, + size=200, + act='relu', + param_attr='G.w1', + bias_attr='G.b1') + img = fluid.layers.fc(input=hidden, + size=28 * 28, + act='tanh', + param_attr='G.w2', + bias_attr='G.b2') + return img + + +def plot(gen_data): + gen_data.resize(gen_data.shape[0], 28, 28) + n = int(math.ceil(math.sqrt(gen_data.shape[0]))) + fig = plt.figure(figsize=(n, n)) + gs = gridspec.GridSpec(n, n) + gs.update(wspace=0.05, hspace=0.05) + + for i, sample in enumerate(gen_data): + ax = plt.subplot(gs[i]) + plt.axis('off') + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_aspect('equal') + plt.imshow(sample.reshape(28, 28), cmap='Greys_r') + + return fig + + +def main(): + try: + os.makedirs("./out") + except OSError as e: + if e.errno != errno.EEXIST: + raise + + startup_program = fluid.Program() + d_program = fluid.Program() + dg_program = fluid.Program() + + with fluid.program_guard(d_program, startup_program): + img = fluid.layers.data(name='img', shape=[784], dtype='float32') + d_loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=D(img), + label=fluid.layers.data( + name='label', shape=[1], dtype='float32')) + d_loss = fluid.layers.mean(x=d_loss) + + with fluid.program_guard(dg_program, startup_program): + noise = fluid.layers.data( + name='noise', shape=[NOISE_SIZE], dtype='float32') + g_img = G(x=noise) + g_program = dg_program.clone() + dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=D(g_img), + label=fluid.layers.fill_constant_batch_size_like( + input=noise, dtype='float32', shape=[-1, 1], value=1.0)) + dg_loss = fluid.layers.mean(x=dg_loss) + + opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE) + + opt.minimize(loss=d_loss, startup_program=startup_program) + opt.minimize( + loss=dg_loss, + startup_program=startup_program, + parameter_list=[ + p.name for p in g_program.global_block().all_parameters() + ]) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + + num_true = NUM_REAL_IMGS_IN_BATCH + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=60000), + batch_size=num_true) + + for pass_id in range(NUM_PASS): + for batch_id, data in enumerate(train_reader()): + num_true = len(data) + n = numpy.random.uniform( + low=-1.0, high=1.0, + size=[num_true * NOISE_SIZE]).astype('float32').reshape( + [num_true, NOISE_SIZE]) + generated_img = exe.run(g_program, + feed={'noise': n}, + fetch_list={g_img})[0] + real_data = numpy.array(map(lambda x: x[0], data)).astype('float32') + real_data = real_data.reshape(num_true, 784) + total_data = numpy.concatenate([real_data, generated_img]) + total_label = numpy.concatenate([ + numpy.ones( + shape=[real_data.shape[0], 1], dtype='float32'), + numpy.zeros( + shape=[real_data.shape[0], 1], dtype='float32') + ]) + d_loss_np = exe.run(d_program, + feed={'img': total_data, + 'label': total_label}, + fetch_list={d_loss})[0] + for _ in 
xrange(NUM_TRAIN_TIMES_OF_DG): + n = numpy.random.uniform( + low=-1.0, high=1.0, + size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape( + [2 * num_true, NOISE_SIZE, 1, 1]) + dg_loss_np = exe.run(dg_program, + feed={'noise': n}, + fetch_list={dg_loss})[0] + print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format( + pass_id, batch_id, d_loss_np, dg_loss_np)) + # generate image each batch + fig = plot(generated_img) + plt.savefig( + 'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight') + plt.close(fig) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 90269e308a31d2606b23d741ce0d0fa91a0a6aeb..c3b2220e6e2ca6285e4b1193620e7c560b1f7bfa 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -1,10 +1,23 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import random import itertools import paddle.v2.fluid.core as core import collections -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.op import Operator from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.framework import Program, OpProtoHolder @@ -31,7 +44,8 @@ def create_op(scope, op_type, inputs, outputs, attrs): kwargs[in_name] = [] if in_dup: sub_in = inputs[in_name] - for sub_in_name, _ in sub_in: + for item in sub_in: + sub_in_name, _ = item[0], item[1] __create_var__(in_name, sub_in_name) else: __create_var__(in_name, in_name) @@ -41,7 +55,8 @@ def create_op(scope, op_type, inputs, outputs, attrs): kwargs[out_name] = [] if out_dup: sub_out = outputs[out_name] - for sub_out_name, _ in sub_out: + for item in sub_out: + sub_out_name, _ = item[0], item[1] __create_var__(out_name, sub_out_name) else: __create_var__(out_name, out_name) @@ -71,13 +86,15 @@ def set_input(scope, op, inputs, place): if in_name in inputs: if in_dup: sub_in = inputs[in_name] - for sub_in_name, sub_in_val in sub_in: + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] __set_input__(sub_in_name, sub_in_val) else: __set_input__(in_name, inputs[in_name]) -def get_numeric_gradient(scope, +def get_numeric_gradient(place, + scope, op, inputs, input_to_check, @@ -85,17 +102,15 @@ def get_numeric_gradient(scope, delta=0.005, in_place=False): # FIXME: change this method by compile time concepts - set_input(scope, op, inputs, core.CPUPlace()) + set_input(scope, op, inputs, place) def product(dim): return reduce(lambda a, b: a * b, dim, 1) - ctx = core.DeviceContext.create(core.CPUPlace()) - def get_output(): sum = [] for output_name in output_names: - op.run(scope, ctx) + op.run(scope, place) sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) return np.array(sum).mean() @@ -129,7 +144,7 @@ def get_numeric_gradient(scope, # we use a for loop to compute the gradient of every element. 
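+ # each element is perturbed by +/-delta in turn and get_output() (the mean
+ # of the fetched outputs) is re-evaluated; the resulting difference
+ # quotient is the numeric estimate of the gradient for that element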
for i in xrange(tensor_size): if in_place: - set_input(scope, op, inputs, core.CPUPlace()) + set_input(scope, op, inputs, place) # get one input element throw it's index i. origin = __get_elem__(tensor_to_check, i) @@ -139,7 +154,7 @@ def get_numeric_gradient(scope, y_pos = get_output() if in_place: - set_input(scope, op, inputs, core.CPUPlace()) + set_input(scope, op, inputs, place) x_neg = origin - delta __set_elem__(tensor_to_check, i, x_neg) @@ -261,7 +276,10 @@ class OpTest(unittest.TestCase): feed_map = self.feed_var(inputs, place) exe = Executor(place) - outs = exe.run(program, feed=feed_map, fetch_list=fetch_list) + outs = exe.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: @@ -282,7 +300,8 @@ class OpTest(unittest.TestCase): if not isinstance(sub_out, list): raise AssertionError("sub_out type %s is not list", type(sub_out)) - for sub_out_name, expect in sub_out: + for item in sub_out: + sub_out_name, expect = item[0], item[1] idx = find_actual(sub_out_name, fetch_list) actual = outs[idx] actual_t = np.array(actual) @@ -315,7 +334,7 @@ class OpTest(unittest.TestCase): def check_output(self, atol=1e-5): places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu(self.op_type): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_output_with_place(place, atol) @@ -346,6 +365,24 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): + places = [core.CPUPlace()] + if core.is_compile_gpu() and core.op_support_gpu(self.op_type): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_grad_with_place(place, inputs_to_check, output_names, + no_grad_set, numeric_grad_delta, + in_place, max_relative_error, + user_defined_grads) + + def check_grad_with_place(self, + place, + inputs_to_check, + output_names, + no_grad_set=None, + numeric_grad_delta=0.005, + in_place=False, + max_relative_error=0.005, + user_defined_grads=None): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() @@ -361,6 +398,7 @@ class OpTest(unittest.TestCase): numeric_grads = user_defined_grads or [ get_numeric_gradient( + place, self.scope, self.op, self.inputs, @@ -369,22 +407,12 @@ class OpTest(unittest.TestCase): delta=numeric_grad_delta, in_place=in_place) for input_to_check in inputs_to_check ] - cpu_place = core.CPUPlace() - cpu_analytic_grads = self._get_gradient(inputs_to_check, cpu_place, - output_names, no_grad_set) - - self.__assert_is_close(numeric_grads, cpu_analytic_grads, - inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(cpu_place)) - - if core.is_compile_gpu() and self.op.support_gpu(): - gpu_place = core.GPUPlace(0) - gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place, - output_names, no_grad_set) + analytic_grads = self._get_gradient(inputs_to_check, place, + output_names, no_grad_set) - self.__assert_is_close(numeric_grads, gpu_analytic_grads, - inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(gpu_place)) + self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check, + max_relative_error, + "Gradient Check On %s" % str(place)) @staticmethod def _create_var_descs_(block, var_dict): @@ -458,7 +486,7 @@ class OpTest(unittest.TestCase): mean_inputs = map(block.var, output_names) if 
len(mean_inputs) == 1: - loss = block.create_var(dtype=mean_inputs[0].data_type, shape=[1]) + loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) op = block.append_op( inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') op.desc.infer_var_type(block.desc) @@ -466,8 +494,7 @@ class OpTest(unittest.TestCase): else: avg_sum = [] for cur_loss in mean_inputs: - cur_avg_loss = block.create_var( - dtype=cur_loss.data_type, shape=[1]) + cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) op = block.append_op( inputs={"X": [cur_loss]}, outputs={"Out": [cur_avg_loss]}, @@ -476,13 +503,13 @@ class OpTest(unittest.TestCase): op.desc.infer_shape(block.desc) avg_sum.append(cur_avg_loss) - loss_sum = block.create_var(dtype=avg_sum[0].data_type, shape=[1]) + loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) op_sum = block.append_op( inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') op_sum.desc.infer_var_type(block.desc) op_sum.desc.infer_shape(block.desc) - loss = block.create_var(dtype=loss_sum.data_type, shape=[1]) + loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) op_loss = block.append_op( inputs={"X": loss_sum}, outputs={"Out": loss}, @@ -491,7 +518,7 @@ class OpTest(unittest.TestCase): op_loss.desc.infer_var_type(block.desc) op_loss.desc.infer_shape(block.desc) - param_grad_list = append_backward_ops( + param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) feed_dict = { @@ -501,5 +528,6 @@ class OpTest(unittest.TestCase): fetch_list = [g for p, g in param_grad_list] executor = Executor(place) - result = executor.run(prog, feed_dict, fetch_list) - return map(np.array, result) + return map( + np.array, + executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/v2/fluid/tests/test_accuracy_op.py b/python/paddle/v2/fluid/tests/test_accuracy_op.py index 6f72918b7178bc1f856010f1111f18842f6cc34a..a20abac8a0ce689e1c49f7f7e082137d2cb3fbb4 100644 --- a/python/paddle/v2/fluid/tests/test_accuracy_op.py +++ b/python/paddle/v2/fluid/tests/test_accuracy_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py index 7649e60a3833e34523d87cb963af3888c3cef65d..a6a6eb9d635ed3ece0a0f22d629955b06098e321 100644 --- a/python/paddle/v2/fluid/tests/test_activation_op.py +++ b/python/paddle/v2/fluid/tests/test_activation_op.py @@ -1,6 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest +from scipy.special import expit class TestExp(OpTest): @@ -9,13 +23,13 @@ class TestExp(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': np.exp(self.inputs['X'])} + self.outputs = {'Out': np.exp(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestSigmoid(OpTest): @@ -24,13 +38,13 @@ class TestSigmoid(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))} + self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.008) + self.check_grad(['X'], 'Out', max_relative_error=0.008) class TestLogSigmoid(OpTest): @@ -39,13 +53,13 @@ class TestLogSigmoid(OpTest): self.inputs = { 'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': np.log(1 / (1 + np.exp(-self.inputs['X'])))} + self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.008) + self.check_grad(['X'], 'Out', max_relative_error=0.008) class TestTanh(OpTest): @@ -54,13 +68,13 @@ class TestTanh(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': np.tanh(self.inputs['X'])} + self.outputs = {'Out': np.tanh(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestTanhShrink(OpTest): @@ -69,13 +83,13 @@ class TestTanhShrink(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32") } - self.outputs = {'Y': self.inputs['X'] - np.tanh(self.inputs['X'])} + self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.008) + self.check_grad(['X'], 'Out', max_relative_error=0.008) class TestHardShrink(OpTest): @@ -89,13 +103,13 @@ class TestHardShrink(OpTest): t = np.copy(x) t[(t >= -threshold) & (t <= threshold)] = 0 - self.outputs = {'Y': t} + self.outputs = {'Out': t} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.005) + self.check_grad(['X'], 'Out', max_relative_error=0.005) class TestSoftShrink(OpTest): @@ -109,13 +123,13 @@ class TestSoftShrink(OpTest): y = np.copy(self.inputs['X']) y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * ( y - lambda_val) - self.outputs = {'Y': y} + self.outputs = {'Out': y} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', 
max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestSqrt(OpTest): @@ -124,13 +138,13 @@ class TestSqrt(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': np.sqrt(self.inputs['X'])} + self.outputs = {'Out': np.sqrt(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestAbs(OpTest): @@ -143,13 +157,56 @@ class TestAbs(OpTest): # we should avoid this x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} - self.outputs = {'Y': np.abs(self.inputs['X'])} + self.outputs = {'Out': np.abs(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestCeil(OpTest): + def setUp(self): + self.op_type = "ceil" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + self.outputs = {'Out': np.ceil(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestFloor(OpTest): + def setUp(self): + self.op_type = "floor" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + # numpy floor need +1 + self.outputs = {'Out': np.floor(self.inputs['X']) + 1.0} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestRound(OpTest): + def setUp(self): + self.op_type = "round" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + self.outputs = {'Out': np.round(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestRelu(OpTest): @@ -159,13 +216,13 @@ class TestRelu(OpTest): # The same reason with TestAbs x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} - self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} + self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestBRelu(OpTest): @@ -183,13 +240,13 @@ class TestBRelu(OpTest): t = np.copy(x) t[t < t_min] = t_min t[t > t_max] = t_max - self.outputs = {'Y': t} + self.outputs = {'Out': t} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.02) + self.check_grad(['X'], 'Out', max_relative_error=0.02) class TestRelu6(OpTest): @@ -204,14 +261,14 @@ class TestRelu6(OpTest): self.inputs = {'X': x} self.attrs = {'threshold': threshold} self.outputs = { - 'Y': np.minimum(np.maximum(self.inputs['X'], 0), threshold) + 'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold) } def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.02) + self.check_grad(['X'], 'Out', max_relative_error=0.02) class TestSoftRelu(OpTest): @@ -227,13 +284,13 @@ class TestSoftRelu(OpTest): t = np.copy(x) t[t < -threshold] = -threshold t[t > threshold] = threshold - self.outputs = {'Y': np.log((np.exp(t) + 1))} + self.outputs = 
{'Out': np.log((np.exp(t) + 1))} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.02) + self.check_grad(['X'], 'Out', max_relative_error=0.02) class TestELU(OpTest): @@ -246,27 +303,27 @@ class TestELU(OpTest): self.inputs = {'X': x} self.attrs = {'alpha': alpha} self.outputs = { - 'Y': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1)) + 'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1)) } def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.02) + self.check_grad(['X'], 'Out', max_relative_error=0.02) class TestReciprocal(OpTest): def setUp(self): self.op_type = "reciprocal" self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} - self.outputs = {'Y': np.reciprocal(self.inputs['X'])} + self.outputs = {'Out': np.reciprocal(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.01) + self.check_grad(['X'], 'Out', max_relative_error=0.01) class TestLog(OpTest): @@ -275,13 +332,13 @@ class TestLog(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': np.log(self.inputs['X'])} + self.outputs = {'Out': np.log(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestSquare(OpTest): @@ -290,13 +347,13 @@ class TestSquare(OpTest): self.inputs = { 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") } - self.outputs = {'Y': np.square(self.inputs['X'])} + self.outputs = {'Out': np.square(self.inputs['X'])} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestPow(OpTest): @@ -304,13 +361,13 @@ class TestPow(OpTest): self.op_type = "pow" self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} self.attrs = {'factor': 3.0} - self.outputs = {'Y': np.power(self.inputs['X'], 3)} + self.outputs = {'Out': np.power(self.inputs['X'], 3)} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.02) + self.check_grad(['X'], 'Out', max_relative_error=0.02) class TestSTanh(OpTest): @@ -322,13 +379,13 @@ class TestSTanh(OpTest): scale_a = 2.0 / 3.0 scale_b = 1.7159 self.attrs = {'scale_a': scale_a, 'scale_b': scale_b} - self.outputs = {'Y': scale_b * np.tanh(self.inputs['X'] * scale_a)} + self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestSoftplus(OpTest): @@ -337,13 +394,13 @@ class TestSoftplus(OpTest): self.inputs = { 'X': np.random.uniform(-1, 1, [11, 17]).astype("float64") } - self.outputs = {'Y': np.log(1 + np.exp(self.inputs['X']))} + self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestSoftsign(OpTest): @@ -353,14 +410,14 @@ class TestSoftsign(OpTest): 'X': 
np.random.uniform(-1, 1, [11, 17]).astype("float32") } self.outputs = { - 'Y': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X'])) + 'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X'])) } def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) + self.check_grad(['X'], 'Out', max_relative_error=0.007) class TestThresholdedRelu(OpTest): @@ -375,13 +432,13 @@ class TestThresholdedRelu(OpTest): self.inputs = {'X': X} self.attrs = {'threshold': threshold} - self.outputs = {'Y': (X > threshold) * X} + self.outputs = {'Out': (X > threshold) * X} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=self.relative_error) + self.check_grad(['X'], 'Out', max_relative_error=self.relative_error) class TestHardSigmoid(OpTest): @@ -403,13 +460,28 @@ class TestHardSigmoid(OpTest): upper_threshold - 0.2 temp = X * slope + offset - self.outputs = {'Y': np.maximum(0.0, np.minimum(1.0, temp))} + self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.002) + + +class TestSwish(OpTest): + def setUp(self): + self.op_type = "swish" + X = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + self.inputs = {'X': X} + self.attrs = {'beta': 2.3} + self.outputs = {'Out': X * expit(self.attrs['beta'] * X)} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.002) + self.check_grad(['X'], 'Out', max_relative_error=0.008) if __name__ == "__main__": diff --git a/python/paddle/v2/fluid/tests/test_adadelta_op.py b/python/paddle/v2/fluid/tests/test_adadelta_op.py index 7105593a98aee9885ba16e3ee0649a6024033ee7..8de6a1f9a9da4834d7b52baade12bb15ef128cad 100644 --- a/python/paddle/v2/fluid/tests/test_adadelta_op.py +++ b/python/paddle/v2/fluid/tests/test_adadelta_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 903e84c32887100bbeef6ebf81f66f06f084fab5..30ed092d4894f5c28775748a71f474c46a1cb2d3 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import paddle.v2.fluid.core as core @@ -113,8 +126,7 @@ class TestSparseAdagradOp(unittest.TestCase): LearningRate='LearningRate', epsilon=2.0) - ctx = core.DeviceContext.create(place) - adagrad_op.run(scope, ctx) + adagrad_op.run(scope, place) # get and compare moment result moment_result_array = np.array(moment) @@ -168,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase): def test_sparse_adagrad(self): places = [core.CPUPlace()] if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index a0d6655d4cbcff8ed3d55df0f4e68fc6591fbb11..32d00cf702e7fdca2b9f975dcaf93d721fe6cce1 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -1,6 +1,21 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
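The activation-op hunks above rename the output key from 'Y' to 'Out' throughout, and the newly added TestSwish builds its reference as X * expit(beta * X). A standalone sketch of that reference, assuming the test's `expit` is scipy.special.expit (the logistic sigmoid):

import numpy as np
from scipy.special import expit  # assumption: the test's expit is this one

def swish_reference(x, beta):
    # swish(x) = x * sigmoid(beta * x)
    return x * expit(beta * x)

x = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
print(swish_reference(x, beta=2.3).shape)  # (11, 17)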
import unittest import numpy as np from op_test import OpTest +from paddle.v2.fluid import core +from paddle.v2.fluid.op import Operator class TestAdamOp1(OpTest): @@ -176,5 +191,124 @@ def adam_step(inputs, attributes): return param_out, moment1_out, moment2_out +def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): + ''' + Simulate one step of the adam optimizer + :param inputs: dict of inputs + :param attributes: dict of attributes + :return tuple: tuple of output param, moment1, moment2, + beta1 power accumulator and beta2 power accumulator + ''' + param = inputs['Param'] + # grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + beta1 = attributes['beta1'] + beta2 = attributes['beta2'] + epsilon = attributes['epsilon'] + + moment1_out = np.zeros(shape=[height, row_numel]) + moment2_out = np.zeros(shape=[height, row_numel]) + param_out = np.zeros(shape=[height, row_numel]) + + for idx, row_id in enumerate(rows): + moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 + ) * np_grad[idx] + moment2_out[row_id] = beta2 * moment2[row_id] + ( + 1 - beta2) * np.square(np_grad[idx]) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / ( + np.sqrt(moment2_out[row_id]) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestSparseAdamOp(unittest.TestCase): + def setup(self, scope, place): + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + + height = 10 + rows = [0, 4, 7] + self.rows = rows + row_numel = 12 + self.row_numel = row_numel + self.dense_inputs = { + "Param": np.full((height, row_numel), 5.0).astype("float32"), + "Moment1": np.full((height, row_numel), 5.0).astype("float32"), + "Moment2": np.full((height, row_numel), 5.0).astype("float32"), + 'Beta1Pow': np.array([beta1**10]).astype("float32"), + 'Beta2Pow': np.array([beta2**10]).astype("float32"), + "LearningRate": np.full((1), 2.0).astype("float32") + } + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + self.sparse_inputs = ["Grad"] + + param_out, mom1, mom2 = adam_step_sparse( + self.dense_inputs, self.attrs, height, rows, row_numel, np_array) + self.outputs = { + "ParamOut": param_out, + "Moment1Out": mom1, + "Moment2Out": mom2 + } + + def check_with_place(self, place): + scope = core.Scope() + self.setup(scope, place) + + op_args = dict() + for key, np_array in self.dense_inputs.iteritems(): + var = scope.var(key).get_tensor() + var.set(np_array, place) + op_args[key] = key + for s in self.sparse_inputs: + op_args[s] = s + for s in self.outputs: + var = scope.var(s).get_tensor() + var.set(self.outputs[s], place) + op_args[s] = s + for k in self.attrs: + op_args[k] = self.attrs[k] + + # create and run sgd operator + adam_op = Operator("adam", **op_args) + adam_op.run(scope, place) + + for key, np_array in self.outputs.iteritems(): + out_var = scope.var(key).get_tensor() + actual = np.array(out_var) + actual = actual.reshape([actual.size]) + np_array = np_array.reshape([np_array.size]) + for idx, row_id in enumerate(self.rows): + j = 0 + while j < 
self.row_numel: + pos = row_id * self.row_numel + j + self.assertLess((actual[pos] - np_array[pos]) / actual[pos], + 0.00001) + j += 1 + + def test_sparse_sgd(self): + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_adamax_op.py b/python/paddle/v2/fluid/tests/test_adamax_op.py index 8e5a15aa3d12bbaae99cae6fcb627a336e48f684..35b2bc47ed62cb21b6e58a172e5b7e4d34f52eb4 100644 --- a/python/paddle/v2/fluid/tests/test_adamax_op.py +++ b/python/paddle/v2/fluid/tests/test_adamax_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py index e019a4e15f0e25deaedf30911b44e576c8f89013..8775cd4f9fb93e439920e29df4b525485254754c 100644 --- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py +++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py @@ -1,9 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
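adam_step_sparse above applies the standard Adam update only to the rows present in the selected-rows gradient: m1' = b1*m1 + (1-b1)*g, m2' = b2*m2 + (1-b2)*g^2, lr_t = lr*sqrt(1-b2^t)/(1-b1^t), p' = p - lr_t * m1' / (sqrt(m2') + eps). A standalone NumPy check of one touched row, using the test's own constants and an all-ones gradient row for illustration:

import numpy as np

beta1, beta2, epsilon, lr = 0.78, 0.836, 1e-4, 2.0
beta1_pow, beta2_pow = beta1**10, beta2**10   # the Beta1Pow / Beta2Pow inputs
param = np.full(12, 5.0)                      # one row of Param (row_numel = 12)
m1 = np.full(12, 5.0)
m2 = np.full(12, 5.0)
grad = np.ones(12)                            # one row of the sparse gradient

m1_out = beta1 * m1 + (1 - beta1) * grad
m2_out = beta2 * m2 + (1 - beta2) * np.square(grad)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out = param - lr_t * (m1_out / (np.sqrt(m2_out) + epsilon))
print(param_out[:3])

Note that check_with_place above uses dict.iteritems, so this test suite is Python 2 code, and its per-element loop asserts a relative error below 1e-5 only on the rows the sparse gradient touched.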
import unittest import paddle.v2.fluid.core as core import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.backward import append_backward +from paddle.v2.fluid.framework import default_main_program import numpy @@ -52,32 +65,29 @@ class TestArrayReadWrite(unittest.TestCase): exe = Executor(cpu) - tensor = core.LoDTensor() - tensor.set(numpy.random.random(size=(100, 100)).astype('float32'), cpu) + tensor = numpy.random.random(size=(100, 100)).astype('float32') - outs = map(numpy.array, - exe.run(feed={'x0': tensor, - 'x1': tensor, - 'x2': tensor}, - fetch_list=[a_sum, x_sum], - scope=scope)) + outs = exe.run(feed={'x0': tensor, + 'x1': tensor, + 'x2': tensor}, + fetch_list=[a_sum, x_sum], + scope=scope) self.assertEqual(outs[0], outs[1]) total_sum = layers.sums(input=[a_sum, x_sum]) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) - append_backward_ops(total_sum_scaled) + append_backward(total_sum_scaled) - g_vars = map(g_main_program.global_block().var, + g_vars = map(default_main_program().global_block().var, [each_x.name + "@GRAD" for each_x in x]) g_out = [ item.sum() - for item in map( - numpy.array, - exe.run(feed={'x0': tensor, - 'x1': tensor, - 'x2': tensor}, - fetch_list=g_vars)) + for item in exe.run( + feed={'x0': tensor, + 'x1': tensor, + 'x2': tensor}, + fetch_list=g_vars) ] g_out_sum = numpy.array(g_out).sum() diff --git a/python/paddle/v2/fluid/tests/test_assign_op.py b/python/paddle/v2/fluid/tests/test_assign_op.py index 1b0c145f1a69678b228bc70e4e4e273f5bcf9888..4ac173c96bd2c02fd0704d8e2c22faa96b65714e 100644 --- a/python/paddle/v2/fluid/tests/test_assign_op.py +++ b/python/paddle/v2/fluid/tests/test_assign_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import op_test import numpy import unittest diff --git a/python/paddle/v2/fluid/tests/test_assign_value_op.py b/python/paddle/v2/fluid/tests/test_assign_value_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e2ff9bdeb88cc1fa19055a8e1ff4e6156f0477 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_assign_value_op.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
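The array read/write hunk above drops the manual LoDTensor construction and the map(numpy.array, ...) wrappers: feeds now accept plain numpy arrays, and Executor.run already returns numpy arrays. A minimal sketch of the simplified feed/fetch flow (the tiny scale program is a hypothetical stand-in, not from the patch):

import numpy
import paddle.v2.fluid as fluid

x = fluid.layers.data(name='x0', shape=[100], dtype='float32')
y = fluid.layers.scale(x=x, scale=2.0)

exe = fluid.Executor(fluid.CPUPlace())
tensor = numpy.random.random(size=(100, 100)).astype('float32')
out = exe.run(feed={'x0': tensor}, fetch_list=[y])[0]  # already a numpy array
print(out.shape)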
+import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import op_test +import numpy +import unittest +import paddle.v2.fluid.framework as framework + + +class TestAssignValueOp(op_test.OpTest): + def setUp(self): + self.op_type = "assign_value" + x = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.inputs = {} + self.outputs = {'Out': x} + self.attrs = { + 'shape': x.shape, + 'dtype': framework.convert_np_dtype_to_dtype_(x.dtype), + 'fp32_values': [float(v) for v in x.flat] + } + + def test_forward(self): + self.check_output() + + def test_assign(self): + val = ( + -100 + 200 * numpy.random.random(size=(2, 5))).astype(numpy.int32) + x = layers.create_tensor(dtype="float32") + layers.assign(input=val, output=x) + exe = fluid.Executor(fluid.CPUPlace()) + fetched_x = exe.run(fluid.default_main_program(), + feed={}, + fetch_list=[x])[0] + self.assertTrue( + numpy.array_equal(fetched_x, val), + "fetch_x=%s val=%s" % (fetched_x, val)) + self.assertEqual(fetched_x.dtype, val.dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_auc_op.py b/python/paddle/v2/fluid/tests/test_auc_op.py index 26ea905d88093605dff820b178996a5724becf82..aa74d224d5dcdcce4c478df8f85fb45d05d87f32 100644 --- a/python/paddle/v2/fluid/tests/test_auc_op.py +++ b/python/paddle/v2/fluid/tests/test_auc_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index 71f9599e0de83c86808f7e62547f80d3d50ffc7d..fe82b7d7f31aa58d38bddba72e64d471c0a2eec0 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -1,12 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
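test_assign_value_op above encodes the whole target tensor in operator attributes: its shape, its dtype, and a flat list of float values. A standalone numpy round-trip showing that this flattening loses nothing:

import numpy

x = numpy.random.random(size=(2, 5)).astype(numpy.float32)
fp32_values = [float(v) for v in x.flat]  # what the test stores in attrs
restored = numpy.array(fp32_values, dtype=numpy.float32).reshape(x.shape)
assert numpy.array_equal(restored, x)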
import unittest import numpy as np from op_test import OpTest import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator - - -def grad_var_name(var_name): - return var_name + "@GRAD" +from paddle.v2.fluid.framework import grad_var_name def get_backward_op(scope, op, no_grad_set): @@ -21,6 +31,13 @@ def get_backward_op(scope, op, no_grad_set): def _reference_training(x, scale, offset, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if data_format == "NCHW": n, c, h, w = x.shape x_square = x * x @@ -39,6 +56,8 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile + if len(x_shape) == 2: + y = np.reshape(y, (y.shape[0], y.shape[1])) return y, mean, var elif data_format == "NHWC": x_square = x * x @@ -48,7 +67,10 @@ def _reference_training(x, scale, offset, epsilon, data_format): mean = x_sum / element_count var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) - return (normalized * scale + offset), mean, var + y = normalized * scale + offset + if len(x_shape) == 2: + y = np.reshape(y, x_shape) + return y, mean, var else: raise ValueError("Unknown data order.") @@ -65,6 +87,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation + x_shape = x.shape + + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + grad_y = np.reshape(grad_y, + (grad_y.shape[0], grad_y.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + grad_y = np.reshape(grad_y, + (grad_y.shape[0], 1, 1, grad_y.shape[1])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) grad_y = np.transpose(grad_y, (0, 2, 3, 1)) @@ -83,6 +117,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): grad_x = np.transpose(grad_x, (0, 3, 1, 2)) x = np.transpose(x, (0, 3, 1, 2)) grad_y = np.transpose(grad_y, (0, 3, 1, 2)) + + if len(x_shape) == 2: + grad_x = np.reshape(grad_x, x_shape) return grad_x, grad_scale, grad_offset @@ -127,7 +164,7 @@ class TestBatchNormOp(OpTest): momentum = 0.9 # N, H, W, C: 2, 3, 4, 2 - n, h, w, c = 2, 3, 4, 2 + n, h, w, c = 2, 3, 4, 5 x_shape = [n, h, w, c] scale_shape = [c] @@ -184,20 +221,23 @@ class TestBatchNormOp(OpTest): print 'python: NHWC, NCHW, backward checking passed' def test_forward_backward(self): - def test_with_place(place, tensor_format): + def test_with_place(place, data_layout, shape): # attr epsilon = 0.00001 momentum = 0.9 - # N, H, W, C: 12, 3, 4, 2 - n, h, w, c = 2, 3, 4, 2 - - if data_format == "NHWC": - x_shape = [n, h, w, c] - elif data_format == "NCHW": - x_shape = [n, c, h, w] + if len(shape) == 2: + x_shape = shape + c = shape[1] else: - raise ValueError("Unknown data type.") + # n, h, w, c = 2, 3, 4, 2 + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_format == "NHWC": + x_shape = [n, h, w, c] + elif data_format == "NCHW": + x_shape = [n, c, h, w] + else: + raise ValueError("Unknown data type.") scale_shape = [c] x_val = np.random.random_sample(x_shape).astype(np.float32) @@ -219,7 +259,10 @@ class TestBatchNormOp(OpTest): # for gradient test # y_grad = 
np.ones(x_shape).astype(np.float32) y_grad = np.zeros(x_shape).astype(np.float32) - y_grad[0, 0, 0, 0] = 1. + if len(y_grad.shape) == 2: + y_grad[0, 0] = 1. + else: + y_grad[0, 0, 0, 0] = 1. # y_grad = np.random.random_sample(x_shape).astype(np.float32) x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, @@ -262,12 +305,11 @@ class TestBatchNormOp(OpTest): SavedVariance="saved_variance", # attrs is_test=False, - tensor_format=tensor_format, + data_layout=data_layout, momentum=momentum, epsilon=epsilon) - ctx = core.DeviceContext.create(place) - batch_norm_op.run(scope, ctx) + batch_norm_op.run(scope, place) # check forward result self.__assert_close(y_tensor, y_out, "y_out") @@ -275,13 +317,13 @@ class TestBatchNormOp(OpTest): self.__assert_close(saved_variance_tensor, saved_variance, "saved_variance") self.__assert_close(mean_out_tensor, mean_out, "mean_out") - if isinstance(place, core.GPUPlace): + if isinstance(place, core.CUDAPlace): atol = 5e-2 else: atol = 1e-4 self.__assert_close(variance_out_tensor, variance_out, "variance_out", atol) - print "op test forward passed: ", str(place), tensor_format + print "op test forward passed: ", str(place), data_layout # run backward batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set()) @@ -290,7 +332,7 @@ class TestBatchNormOp(OpTest): ["y_out", "mean", "variance", "saved_mean", "saved_variance"], place, feed_dict={"y_out": y_grad}) - batch_norm_op_grad.run(scope, ctx) + batch_norm_op_grad.run(scope, place) x_grad_tensor = create_or_get_tensor(scope, grad_var_name("x_val"), None, @@ -306,14 +348,16 @@ class TestBatchNormOp(OpTest): self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad") self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad") self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad") - print "op test backward passed: ", str(place), tensor_format + print "op test backward passed: ", str(place), data_layout places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) + for place in places: for data_format in ["NCHW", "NHWC"]: - test_with_place(place, data_format) + test_with_place(place, data_format, [2, 3, 4, 5]) + test_with_place(place, data_format, [2, 3]) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py index 8a11820d2aba2dd4d17d925f0e0fe9f324100418..9ef6e08cc189035c55d52ecaf209d5d607de0ed0 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
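_reference_training above computes per-channel statistics over N*H*W and then normalizes; the new 2-D branches simply reshape an (N, C) input into a 4-D tensor first, and the driver now also loops a [2, 3] shape through both layouts with the renamed CUDAPlace. A hedged NumPy sketch of the NHWC forward pass the reference implements:

import numpy as np

x = np.random.random_sample((2, 3, 4, 5)).astype(np.float32)  # NHWC
scale = np.ones(5, np.float32)
offset = np.zeros(5, np.float32)
epsilon = 1e-5

mean = x.mean(axis=(0, 1, 2))   # per-channel mean over N, H, W
var = x.var(axis=(0, 1, 2))     # biased variance, matching the reference
y = (x - mean) / np.sqrt(var + epsilon) * scale + offset
print(y.shape, mean.shape, var.shape)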
import unittest import numpy as np @@ -35,15 +48,15 @@ class TestBeamSearchDecodeOp(unittest.TestCase): self.append_lod_tensor( scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], np.array( - [1, 2, 3, 4, 5, 6], dtype="float32")) + [1, 2, 3, 4, 5, 6], dtype="float64")) self.append_lod_tensor( scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], np.array( - [0, 1, 2, 3, 4, 5], dtype="float32")) + [0, 1, 2, 3, 4, 5], dtype="float64")) self.append_lod_tensor( scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], np.array( - [0, 1, 2, 3, 4], dtype="float32")) + [0, 1, 2, 3, 4], dtype="float64")) sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor() @@ -57,8 +70,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase): SentenceIds="sentence_ids", SentenceScores="sentence_scores") - ctx = core.DeviceContext.create(self.cpu_place) - beam_search_decode_op.run(self.scope, ctx) + beam_search_decode_op.run(self.scope, self.cpu_place) expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] self.assertEqual(sentence_ids.lod(), expected_lod) diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py index cc7c09bb59de3f83e47b4d95c1203f7f050c5132..f31c737ba6cb7d17405e96506834627c9c5761b4 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
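The beam-search-decode test above builds two-level LoD tensors such as [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]]; each level stores cumulative segment offsets. A tiny standalone helper (not from the patch) that reads those offsets back as segment lengths:

def lod_to_lengths(level):
    # Consecutive offsets delimit one segment each.
    return [level[i + 1] - level[i] for i in range(len(level) - 1)]

print(lod_to_lengths([0, 3, 6]))              # [3, 3]: two sequences
print(lod_to_lengths([0, 1, 2, 3, 4, 5, 6]))  # six sub-sequences of length 1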
import logging from paddle.v2.fluid.op import Operator, DynamicRecurrentOp import paddle.v2.fluid.core as core @@ -14,7 +27,6 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): def setUp(self): self.scope = core.Scope() - self.ctx = core.DeviceContext.create(core.CPUPlace()) self._create_ids() self._create_scores() self._create_pre_ids() @@ -32,19 +44,19 @@ class BeamSearchOpTester(unittest.TestCase): level=0, beam_size=2, end_id=0, ) - op.run(self.scope, self.ctx) + op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() print 'selected_ids', np.array(selected_ids) print 'lod', selected_ids.lod() def _create_pre_ids(self): - np_data = np.array([[1, 2, 3, 4]], dtype='int32') + np_data = np.array([[1, 2, 3, 4]], dtype='int64') tensor = create_tensor(self.scope, "pre_ids", np_data) def _create_ids(self): self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]] np_data = np.array( - [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int32') + [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) tensor.set_lod(self.lod) diff --git a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py index 080ca43b8269e0f6a9f4d0ce3973f4d4a07a8e2a..aed1bf4d3ae867dafdefe67d36bb350453e3ede0 100644 --- a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py +++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_calc_gradient.py b/python/paddle/v2/fluid/tests/test_calc_gradient.py new file mode 100644 index 0000000000000000000000000000000000000000..b99eeb09cdbb7c2174ea5f29ba254ce342517f9c --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_calc_gradient.py @@ -0,0 +1,38 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
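Several hunks above (adagrad, the sparse Adam check, beam-search decode, and the beam-search tester) apply the same API migration: operators run against a Place directly instead of a cached DeviceContext. The convention in isolation, as a minimal sketch:

import paddle.v2.fluid.core as core

scope = core.Scope()
place = core.CPUPlace()
# Old convention: ctx = core.DeviceContext.create(place); some_op.run(scope, ctx)
# New convention (this patch): some_op.run(scope, place)
# On GPU builds the same call takes core.CUDAPlace(0), the renamed GPUPlace.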
+import unittest + +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.optimizer as optimizer +from paddle.v2.fluid.backward import calc_gradient + + +class TestCalcGradient(unittest.TestCase): + def test_calc_gradient(self): + x = layers.create_parameter(dtype="float32", shape=[5, 10]) + y = layers.create_parameter(dtype="float32", shape=[10, 8]) + mul_out = layers.mul(x=x, y=y) + mean_out = layers.mean(x=mul_out) + a = calc_gradient(mean_out, mul_out) + b = calc_gradient(mean_out, x) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_cast_op.py b/python/paddle/v2/fluid/tests/test_cast_op.py index 0c4b6310652e84d3dd7f281a8b98ae0435072afb..3795b96dbf0f8f50ede2aeda7262ba61c095d6af 100644 --- a/python/paddle/v2/fluid/tests/test_cast_op.py +++ b/python/paddle/v2/fluid/tests/test_cast_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import op_test import unittest import numpy as np @@ -10,8 +23,8 @@ class TestCastOp(op_test.OpTest): self.inputs = {'X': ipt.astype('float32')} self.outputs = {'Out': ipt.astype('float64')} self.attrs = { - 'in_data_type': int(core.DataType.FP32), - 'out_data_type': int(core.DataType.FP64) + 'in_dtype': int(core.DataType.FP32), + 'out_dtype': int(core.DataType.FP64) } self.op_type = 'cast' diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py index 48673296a67716c4de804da533f0fd2567f10e2e..59ef2bbb2feebc5930ff9513598e0ac55376635f 100644 --- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py +++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
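TestCastOp above renames the attributes to in_dtype/out_dtype; its expected output remains a plain numpy dtype conversion, as this trivial standalone check shows:

import numpy as np

ipt = np.random.random(size=[10, 10])
x = ipt.astype('float32')        # the op's input 'X'
expected = x.astype('float64')   # the expected 'Out' of the cast op
assert expected.dtype == np.float64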
import unittest import numpy as np from op_test import OpTest @@ -120,7 +133,7 @@ class TestChunkEvalOp(OpTest): self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9 def set_data(self): - infer = np.zeros((self.batch_size, )).astype('int32') + infer = np.zeros((self.batch_size, )).astype('int64') infer.fill(self.num_chunk_types * self.num_tag_types) label = np.copy(infer) starts = np.random.choice( @@ -147,7 +160,13 @@ class TestChunkEvalOp(OpTest): 'Recall': np.asarray( [recall], dtype='float32'), 'F1-Score': np.asarray( - [f1], dtype='float32') + [f1], dtype='float32'), + 'NumInferChunks': np.asarray( + [self.num_infer_chunks], dtype='int64'), + 'NumLabelChunks': np.asarray( + [self.num_label_chunks], dtype='int64'), + 'NumCorrectChunks': np.asarray( + [self.num_correct_chunks], dtype='int64') } def setUp(self): diff --git a/python/paddle/v2/fluid/tests/test_clip.py b/python/paddle/v2/fluid/tests/test_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..63353a10963532ec5b35eff22644adf4823243aa --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_clip.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + +BATCH_SIZE = 128 +CLIP_MAX = 2e-6 +CLIP_MIN = -1e-6 + +prog = fluid.framework.Program() + +with fluid.program_guard(main_program=prog): + image = fluid.layers.data(name='x', shape=[784], dtype='float32') + + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') + + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + +prog_clip = prog.clone() +prog_clip.block(0).var(hidden1.name).set_error_clip( + fluid.clip.ErrorClipByValue( + max=CLIP_MAX, min=CLIP_MIN)) + +avg_cost_clip = prog_clip.block(0).var(avg_cost.name) +fluid.backward.append_backward(loss=avg_cost) +fluid.backward.append_backward( + loss=avg_cost_clip, callback=fluid.clip.error_clip_callback) + +hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD") +hidden1_grad_clip = prog_clip.block(0).var(hidden1.name + "@GRAD") + +hidden2_grad = prog.block(0).var(hidden2.name + "@GRAD") +hidden2_grad_clip = prog_clip.block(0).var(hidden2.name + "@GRAD") + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) +feeder = fluid.DataFeeder(feed_list=[image, label], place=place) +exe.run(fluid.default_startup_program()) + +count = 0 +for data in train_reader(): + count += 1 + if count > 5: + break + out1, out2 = exe.run(prog, + feed=feeder.feed(data), + fetch_list=[hidden1_grad, hidden2_grad]) + out1_clip, out2_clip = exe.run( + 
prog_clip, + feed=feeder.feed(data), + fetch_list=[hidden1_grad_clip, hidden2_grad_clip]) + if not ((out1.clip( + min=CLIP_MIN, max=CLIP_MAX) == out1_clip).all() and + (out2 == out2_clip).all()): + exit(1) + +exit(0) diff --git a/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py index 02f6108a3a661b0e32cd2e7ed65cb4b8cb50c067..5147e75046294cb8fc7673f2e58a4fb8e50e0e53 100644 --- a/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_clip_op.py b/python/paddle/v2/fluid/tests/test_clip_op.py index a7e1bf174408e4139db0435d9f4bb0c885f76705..3338dc61b38c7e46a048458ae81d7592e30bbffb 100644 --- a/python/paddle/v2/fluid/tests/test_clip_op.py +++ b/python/paddle/v2/fluid/tests/test_clip_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_compare_op.py b/python/paddle/v2/fluid/tests/test_compare_op.py index 5d0dfab6ffd1cbbbfbcdb3af60f1868b7b780456..fbf8921e40563cca28ade38465f68eb133b263e4 100644 --- a/python/paddle/v2/fluid/tests/test_compare_op.py +++ b/python/paddle/v2/fluid/tests/test_compare_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
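The chunk_eval hunk above now exposes the raw counts (NumInferChunks, NumLabelChunks, NumCorrectChunks) alongside the derived metrics. Assuming the usual chunk-evaluation definitions (precision = correct/inferred, recall = correct/labeled), the test's default counts of 4 correct, 5 inferred, and 9 labeled chunks give:

num_correct, num_infer, num_label = 4, 5, 9
precision = float(num_correct) / num_infer  # 0.8
recall = float(num_correct) / num_label     # 0.444...
f1 = 2 * precision * recall / (precision + recall)
print(precision, recall, f1)                # 0.8 0.444... 0.571...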
import op_test import unittest import numpy diff --git a/python/paddle/v2/fluid/tests/test_concat_op.py b/python/paddle/v2/fluid/tests/test_concat_op.py index a792d1c106ac00efd92e680cfad67f41a7520e26..3e413e15404f64755fdf1e1db936ca34d61d2b03 100644 --- a/python/paddle/v2/fluid/tests/test_concat_op.py +++ b/python/paddle/v2/fluid/tests/test_concat_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py index 9d1df44b9065f8101e90b87815660f8c0818645f..5312fa51a253a7151107d508120cb590aa822364 100644 --- a/python/paddle/v2/fluid/tests/test_cond_op.py +++ b/python/paddle/v2/fluid/tests/test_cond_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import logging import paddle.v2.fluid.core as core import unittest @@ -65,8 +78,7 @@ class TestCondOp(unittest.TestCase): self.create_global_variables() self.create_cond_op() self.create_sub_net() - ctx = core.DeviceContext.create(core.CPUPlace()) - self.condop.run(self.scope, ctx) + self.condop.run(self.scope, core.CPUPlace()) return np.array(self.scope.find_var("Out").get_tensor()) def create_global_variables(self): diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py index 293803f004a1513611fba30634d5552e1da84fef..965e7d39c807265d5884dc10cce228c5d7f5823f 100644 --- a/python/paddle/v2/fluid/tests/test_conditional_block.py +++ b/python/paddle/v2/fluid/tests/test_conditional_block.py @@ -1,15 +1,28 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -from paddle.v2.fluid.framework import g_startup_program, g_main_program +from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy class ConditionalBlock(unittest.TestCase): def test_forward(self): - data = layers.data(name='X', shape=[1], data_type='float32') + data = layers.data(name='X', shape=[1], dtype='float32') data.stop_gradient = False cond = layers.ConditionalBlock(inputs=[data]) out = layers.create_tensor(dtype='float32') @@ -19,20 +32,19 @@ class ConditionalBlock(unittest.TestCase): cpu = core.CPUPlace() exe = Executor(cpu) - exe.run(g_startup_program) + exe.run(default_startup_program()) - x = core.LoDTensor() - x.set(numpy.random.random(size=(10, 1)).astype('float32'), cpu) + x = numpy.random.random(size=(10, 1)).astype('float32') - outs = map(numpy.array, exe.run(feed={'X': x}, fetch_list=[out]))[0] + outs = exe.run(feed={'X': x}, fetch_list=[out])[0] print outs loss = layers.mean(x=out) - append_backward_ops(loss=loss) - outs = map(numpy.array, - exe.run(feed={'X': x}, - fetch_list=[ - g_main_program.block(0).var(data.name + "@GRAD") - ]))[0] + append_backward(loss=loss) + outs = exe.run( + feed={'X': x}, + fetch_list=[ + default_main_program().block(0).var(data.name + "@GRAD") + ])[0] print outs diff --git a/python/paddle/v2/fluid/tests/test_const_value.py b/python/paddle/v2/fluid/tests/test_const_value.py new file mode 100644 index 0000000000000000000000000000000000000000..190bfa779b4a2fac4d425ba9e45223d2aa04173b --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_const_value.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import paddle.v2.fluid.framework as framework + + +class ConditionalBlock(unittest.TestCase): + def test_const_value(self): + self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD") + self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@") + self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD") + self.assertEqual(framework.ZERO_VAR_SUFFIX, "@ZERO") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 2240dc73cdd31f320fed174dd811e93c6640137f..8b03a3ae16592888119e1c9ea797e7bbe8acd324 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -1,5 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -16,8 +31,8 @@ def conv2d_forward_naive(input, filter, group, conv_param): out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1] out = np.zeros((in_n, out_c, out_h, out_w)) - d_bolck_w = (dilation[0] * (f_h - 1) + 1) - d_bolck_h = (dilation[1] * (f_w - 1) + 1) + d_bolck_h = (dilation[0] * (f_h - 1) + 1) + d_bolck_w = (dilation[1] * (f_w - 1) + 1) input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )), mode='constant', @@ -47,6 +62,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): class TestConv2dOp(OpTest): def setUp(self): + self.use_cudnn = False self.init_op_type() self.init_group() self.init_dilation() @@ -67,30 +83,59 @@ class TestConv2dOp(OpTest): 'strides': self.stride, 'paddings': self.pad, 'groups': self.groups, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn } self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.02) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0] @@ -164,34 +209,39 @@ class TestWithDilation(TestConv2dOp): self.groups = 3 -#----------------Conv2dCudnn---------------- -class TestCudnn(TestConv2dOp): +#----------------Conv2dCUDNN---------------- +class TestCUDNN(TestConv2dOp): def init_op_type(self): - self.op_type = "conv_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWithPad(TestWithPad): +class TestCUDNNWithPad(TestWithPad): def init_op_type(self): - self.op_type = "conv_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWithStride(TestWithStride): +class TestCUDNNWithStride(TestWithStride): def init_op_type(self): - self.op_type = "conv_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class 
TestCudnnWithGroup(TestWithGroup): +class TestCUDNNWithGroup(TestWithGroup): def init_op_type(self): - self.op_type = "conv_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWith1x1(TestWith1x1): +class TestCUDNNWith1x1(TestWith1x1): def init_op_type(self): - self.op_type = "conv_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" # cudnn v5 does not support dilation conv. -# class TestCudnnWithDilation(TestWithDilation): +# class TestCUDNNWithDilation(TestWithDilation): # def init_op_type(self): # self.op_type = "conv_cudnn" diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py index d7b1f2f2a3abf6335998742dbbef8e17794170fa..b7b86c58fb81d44b153c3b6724d5ebd524db55e3 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py @@ -1,16 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest -def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): +def conv2dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_h, in_w = input_.shape f_c, out_c, f_h, f_w = filter_.shape assert in_c == f_c - stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad'] - out_h = (in_h - 1) * stride[0] + f_h - out_w = (in_w - 1) * stride[1] + f_w + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + d_bolck_h = dilations[0] * (f_h - 1) + 1 + d_bolck_w = dilations[1] * (f_w - 1) + 1 + out_h = (in_h - 1) * stride[0] + d_bolck_h + out_w = (in_w - 1) * stride[1] + d_bolck_w out = np.zeros((in_n, out_c, out_h, out_w)) @@ -23,9 +41,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): for k in range(out_c): tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0) - i1, i2 = i * stride[0], i * stride[0] + f_h - j1, j2 = j * stride[0], j * stride[0] + f_w - out[n, k, i1:i2, j1:j2] += tmp_out + i1, i2 = i * stride[0], i * stride[0] + d_bolck_h + j1, j2 = j * stride[0], j * stride[0] + d_bolck_h + out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] return out @@ -34,43 +52,75 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): class TestConv2dTransposeOp(OpTest): def setUp(self): # init as conv transpose + self.use_cudnn = False self.init_op_type() self.init_test_case() - conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} input_ = np.random.random(self.input_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32") - output = conv2dtranspose_forward_naive( - input_, filter_, conv2dtranspose_param).astype('float32') self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, - 'dilations': self.dilations 
+ 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } + + output = conv2dtranspose_forward_naive(input_, filter_, + self.attrs).astype('float32') + self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.02) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def init_test_case(self): self.pad = [0, 0] @@ -104,11 +154,63 @@ class TestWithStride(TestConv2dTransposeOp): self.filter_size = [f_c, 6, 3, 3] +class TestWithDilation(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + # ------------ test_cudnn ------------ -class TestCudnn(TestConv2dTransposeOp): +class TestCUDNN(TestConv2dTransposeOp): def init_op_type(self): - self.op_type = "conv2d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv2d_transpose" + + +class TestCUDNNWithPad(TestWithPad): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv2d_transpose" + + +class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv2d_transpose" + +# #cudnn v5 does not support dilation conv. 
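conv2d_forward_naive earlier sizes its output as 1 + (in + 2*pad - d) / stride with the effective dilated kernel d = dilation*(k-1) + 1, and conv2dtranspose_forward_naive above inverts that arithmetic, producing (in - 1) * stride + d before pad is cropped from both sides. A worked check of both formulas in plain Python (integer division, matching the tests' Python 2 `/`):

in_h, pad, dilation, k, stride = 5, 1, 2, 3, 1
d_block = dilation * (k - 1) + 1                     # effective kernel extent: 5
conv_out = 1 + (in_h + 2 * pad - d_block) // stride  # 3
t_out = (in_h - 1) * stride + d_block                # 9, before cropping
t_out_cropped = t_out - 2 * pad                      # 7, after cropping pad
print(conv_out, t_out, t_out_cropped)                # 3 9 7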
+# class TestCUDNNWithDilation(TestWithDilation): +# def init_test_case(self): +# self.pad = [1, 1] +# self.stride = [2, 2] +# self.dilations = [2, 2] +# self.input_size = [2, 3, 5, 5] # NCHW +# f_c = self.input_size[1] +# self.filter_size = [f_c, 6, 3, 3] +# +# def init_op_type(self): +# self.op_type = "conv2d_transpose" if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py index 934ea46437d67b78309a86a2779e0c6577399136..5b0397cc690caccdc78317071a10eb451afda979 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py @@ -1,5 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -54,6 +69,7 @@ def conv3d_forward_naive(input, filter, group, conv_param): class TestConv3dOp(OpTest): def setUp(self): + self.use_cudnn = False self.init_group() self.init_op_type() self.init_dilation() @@ -62,7 +78,9 @@ class TestConv3dOp(OpTest): conv3d_param = { 'stride': self.stride, 'pad': self.pad, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } input = np.random.random(self.input_size).astype("float32") filter = np.random.random(self.filter_size).astype("float32") @@ -79,25 +97,53 @@ class TestConv3dOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.03) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.03) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0, 0] @@ -169,5 +215,35 @@ class 
TestWithDilation(TestConv3dOp): self.groups = 3 +class TestCUDNN(TestConv3dOp): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d" + + +class TestWithGroup1CUDNN(TestWithGroup1): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d" + + +class TestWithGroup2CUDNN(TestWithGroup2): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d" + + +class TestWith1x1CUDNN(TestWith1x1): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d" + + +# FIXME(typhoonzero): find a way to determine if +# using cudnn > 6 in python +# class TestWithDilationCUDNN(TestWithDilation): +# def init_op_type(self): +# self.op_type = "conv3d" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py index 8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56..b08969062a8ccfec063d7bc1ae3af38f5776fce7 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py @@ -1,17 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest -def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): +def conv3dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_d, in_h, in_w = input_.shape f_c, out_c, f_d, f_h, f_w = filter_.shape assert in_c == f_c - stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad'] - out_d = (in_d - 1) * stride[0] + f_d - out_h = (in_h - 1) * stride[1] + f_h - out_w = (in_w - 1) * stride[2] + f_w + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + + d_bolck_d = dilations[0] * (f_d - 1) + 1 + d_bolck_h = dilations[1] * (f_h - 1) + 1 + d_bolck_w = dilations[2] * (f_w - 1) + 1 + out_d = (in_d - 1) * stride[0] + d_bolck_d + out_h = (in_h - 1) * stride[1] + d_bolck_h + out_w = (in_w - 1) * stride[2] + d_bolck_w out = np.zeros((in_n, out_c, out_d, out_h, out_w)) for n in range(in_n): @@ -25,10 +45,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): for k in range(out_c): tmp_out = np.sum(input_masked * filter_[:, k, :, :, :], axis=0) - d1, d2 = d * stride[0], d * stride[0] + f_d - i1, i2 = i * stride[1], i * stride[1] + f_h - j1, j2 = j * stride[2], j * stride[2] + f_w - out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out + d1, d2 = d * stride[0], d * stride[0] + d_bolck_d + i1, i2 = i * stride[1], i * stride[1] + d_bolck_h + j1, j2 = j * stride[2], j * stride[2] + d_bolck_w + out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2: + dilations[2]] += tmp_out out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - pad[2]] @@ -38,43 +59,75 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): class TestConv3dTransposeOp(OpTest): def setUp(self): # init as conv 
transpose + self.use_cudnn = False self.init_op_type() self.init_test_case() - conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad} input_ = np.random.random(self.input_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32") - output = conv3dtranspose_forward_naive( - input_, filter_, conv3dtranspose_param).astype("float32") self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, - # 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter): should be fixed later } + + output = conv3dtranspose_forward_naive(input_, filter_, + self.attrs).astype("float32") + self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.03) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0, 0] @@ -108,11 +161,63 @@ class TestWithStride(TestConv3dTransposeOp): self.filter_size = [f_c, 6, 3, 3, 3] +class TestWithDilation(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + # ------------ test_cudnn ------------ -class TestCudnn(TestConv3dTransposeOp): +class TestCUDNN(TestConv3dTransposeOp): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + + +class TestCUDNNWithPad(TestWithPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + def init_op_type(self): - self.op_type = "conv3d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv3d_transpose" + + +class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + +# cudnn v5 does not support dilation conv.
+# class TestCUDNNWithDilation(TestWithDilation): +# def init_test_case(self): +# self.pad = [1, 1, 1] +# self.stride = [2, 2, 2] +# self.dilations = [2, 2, 2] +# self.input_size = [2, 3, 5, 5, 5] # NCDHW +# f_c = self.input_size[1] +# self.filter_size = [f_c, 6, 3, 3, 3] +# +# def init_op_type(self): +# self.op_type = "conv3d_transpose" if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv_shift_op.py b/python/paddle/v2/fluid/tests/test_conv_shift_op.py index b9ab21a06a1c6e8e2d1e936a0b4b8a07a59f57b9..14b2640e24cb8bd51111f14c187be73e423d20d2 100644 --- a/python/paddle/v2/fluid/tests/test_conv_shift_op.py +++ b/python/paddle/v2/fluid/tests/test_conv_shift_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_cos_sim_op.py b/python/paddle/v2/fluid/tests/test_cos_sim_op.py index 47557ccb41d1e835b5d04d1b94f54dfc7aa2855a..f6e5e2cbe9ed54d197b6ee640b4ffd1b3c101f99 100644 --- a/python/paddle/v2/fluid/tests/test_cos_sim_op.py +++ b/python/paddle/v2/fluid/tests/test_cos_sim_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py index 42b6f7a3616bbce53a8cae68a5fc1eda411a7422..6c922642210fd66604a66d1dd520fa5783c522c3 100644 --- a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py +++ b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
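The *CUDNN test variants in the conv3d and conv3d_transpose hunks above all reuse one pattern: the base OpTest class defaults to `use_cudnn = False`, and each CUDNN subclass overrides only `init_op_type()` to flip the flag, so every inherited `test_*` method re-runs against the cuDNN kernel. A minimal, framework-free sketch of that inheritance trick (the class names here are illustrative, not Paddle APIs):

```python
import unittest


class BaseConvTest(unittest.TestCase):
    def setUp(self):
        self.use_cudnn = False
        self.init_op_type()  # subclass hook runs before every test

    def init_op_type(self):
        self.op_type = "conv3d"

    def test_backend_flag(self):
        # The real tests branch on self.use_cudnn to pick CUDAPlace(0)
        # versus the CPU path; here we only show that the hook ran.
        self.assertIsInstance(self.use_cudnn, bool)


class CUDNNConvTest(BaseConvTest):
    def init_op_type(self):
        self.use_cudnn = True  # inherited tests now take the cuDNN branch
        self.op_type = "conv3d"


if __name__ == '__main__':
    unittest.main()
```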
import unittest import paddle.v2.fluid.layers as layers diff --git a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py index ee2b996bf430d5a0edaa0de459a937adffd9f8f6..40e80a824a2fdd4ec2a6f2d96a2a6bf14ab74be0 100644 --- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py +++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import random import numpy as np @@ -20,14 +33,14 @@ class CRFDecoding(object): self.w = transition_weights[2:, :] self.track = np.zeros( - (seq_start_positions[-1], self.tag_num), dtype="int32") + (seq_start_positions[-1], self.tag_num), dtype="int64") self.decoded_path = np.zeros( - (seq_start_positions[-1], 1), dtype="int32") + (seq_start_positions[-1], 1), dtype="int64") def _decode_one_sequence(self, decoded_path, x): seq_len, tag_num = x.shape alpha = np.zeros((seq_len, tag_num), dtype="float64") - track = np.zeros((seq_len, tag_num), dtype="int32") + track = np.zeros((seq_len, tag_num), dtype="int64") for i in range(tag_num): alpha[0, i] = self.a[i] + x[0, i] @@ -125,10 +138,10 @@ class TestCRFDecodingOp2(OpTest): axis=0) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32") + low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") predicted_labels = np.ones( - (lod[-1][-1], 1), dtype="int32") * (TAG_NUM - 1) - expected_output = (labels == predicted_labels).astype("int32") + (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1) + expected_output = (labels == predicted_labels).astype("int64") self.inputs = { "Emission": (emission, lod), diff --git a/python/paddle/v2/fluid/tests/test_crop_op.py b/python/paddle/v2/fluid/tests/test_crop_op.py index 62c883bdc130021d06c33ded9c2865505da0b719..a0b2fc954dda10ad264fecaf8827732b8820c71b 100644 --- a/python/paddle/v2/fluid/tests/test_crop_op.py +++ b/python/paddle/v2/fluid/tests/test_crop_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
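The CRFDecoding helper in the crf_decoding hunk above fills an `alpha` score table and an integer `track` back-pointer table, which is why its buffers move from int32 to int64. A standalone NumPy sketch of the same Viterbi recurrence (simplified: it keeps the start weights `a` and transition matrix `w`, but omits the end-of-sequence weights the full test also models):

```python
import numpy as np


def viterbi_decode(a, w, x):
    """a: [tag_num] start scores, w: [tag_num, tag_num] transition scores,
    x: [seq_len, tag_num] emission scores; returns the best tag path."""
    seq_len, tag_num = x.shape
    alpha = np.zeros((seq_len, tag_num), dtype="float64")
    track = np.zeros((seq_len, tag_num), dtype="int64")  # back-pointers
    alpha[0] = a + x[0]
    for t in range(1, seq_len):
        scores = alpha[t - 1][:, None] + w   # [prev_tag, cur_tag]
        track[t] = scores.argmax(axis=0)
        alpha[t] = scores.max(axis=0) + x[t]
    path = [int(alpha[-1].argmax())]
    for t in range(seq_len - 1, 0, -1):
        path.append(int(track[t, path[-1]]))
    return path[::-1]


print(viterbi_decode(np.zeros(3), np.eye(3), np.random.rand(5, 3)))
```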
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py index b81af9364d63bc9b242372e71f175ad047d7c240..f05e6b235656c75cb75c0ae6a8758ceb48a54352 100644 --- a/python/paddle/v2/fluid/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest, randomize_probability diff --git a/python/paddle/v2/fluid/tests/test_data_feeder.py b/python/paddle/v2/fluid/tests/test_data_feeder.py new file mode 100644 index 0000000000000000000000000000000000000000..5574766f8fad2272721f515311190c3dc41ace85 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_data_feeder.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import paddle.v2.fluid as fluid + + +def test_converter(): + img = fluid.layers.data(name='image', shape=[1, 28, 28]) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) + result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]]) + print(result) + + +if __name__ == '__main__': + test_converter() diff --git a/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py index 674c3fda5c82309bbfbbad936a8b0b26929d42d9..5e745a284316e9be512178f909399ca4c7708901 100644 --- a/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
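The new test_data_feeder.py above feeds a minibatch as a nested list, one `[image, label]` pair per sample. A rough NumPy sketch of the stacking that conversion implies (the helper name is hypothetical; the real DataFeeder additionally builds LoD information for sequence inputs):

```python
import numpy as np


def stack_minibatch(minibatch):
    # Gather slot 0 (flattened 28x28 images) and slot 1 (labels) across
    # samples, producing [batch, 784] and [batch, 1] arrays.
    images = np.array([sample[0] for sample in minibatch], dtype='float32')
    labels = np.array([sample[1] for sample in minibatch], dtype='int64')
    return images.reshape(-1, 784), labels.reshape(-1, 1)


imgs, labels = stack_minibatch([[[0] * 784, [9]], [[1] * 784, [1]]])
print(imgs.shape, labels.shape)  # (2, 784) (2, 1)
```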
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_default_scope_funcs.py b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py index 738e69529ea447e87516d5e0efc098910b966ded..7a62168be90514b933f0a8a58e754609ecaad646 100644 --- a/python/paddle/v2/fluid/tests/test_default_scope_funcs.py +++ b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.v2.fluid.default_scope_funcs import * import unittest diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py new file mode 100644 index 0000000000000000000000000000000000000000..147a43628c671658a5d4338da99423eda851b195 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+import unittest +import numpy as np +from op_test import OpTest + + +class TestDetectionOutputOp(OpTest): + def setUp(self): + self.op_type = "detection_output" + self.init_test_case() + + #loc.shape ((1, 4, 4, 1, 1)) + #conf.shape ((1, 4, 2, 1, 1)) + + loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]], + [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], + [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], + [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]]) + conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]], + [[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]]) + priorbox = np.array([ + 0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1, + 0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4, + 0.8, 0.8, 0.1, 0.1, 0.2, 0.2 + ]) + + output = np.array([ + 0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031 + ]) + self.inputs = { + 'Loc': loc.astype('float32'), + 'Conf': conf.astype('float32'), + 'PriorBox': priorbox.astype('float32') + } + self.attrs = { + 'num_classes': self.num_classes, + 'top_k': self.top_k, + 'nms_top_k': self.nms_top_k, + 'background_label_id': self.background_label_id, + 'nms_threshold': self.nms_threshold, + 'confidence_threshold': self.confidence_threshold, + } + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + self.check_output() + + def init_test_case(self): + self.num_classes = 2 + self.top_k = 10 + self.nms_top_k = 20 + self.background_label_id = 0 + self.nms_threshold = 0.01 + self.confidence_threshold = 0.01 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py index b14a366fcad7f4bf6968b6013c6cfbb57090071d..f401050dcc39d4bf786e33ef3e8a4c33e0250044 100644 --- a/python/paddle/v2/fluid/tests/test_dropout_op.py +++ b/python/paddle/v2/fluid/tests/test_dropout_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License.
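One way to read the flat 32-element PriorBox vector in the detection_output test above (my interpretation of the layout, not a documented Paddle API): four prior boxes, each stored as four corner coordinates followed by the four variances used when decoding the Loc offsets:

```python
import numpy as np

priorbox = np.array([
    0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
    0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
    0.8, 0.8, 0.1, 0.1, 0.2, 0.2
])
per_prior = priorbox.reshape(4, 8)
boxes, variances = per_prior[:, :4], per_prior[:, 4:]
print(boxes)      # assumed (xmin, ymin, xmax, ymax) per prior
print(variances)  # assumed SSD-style decode variances (0.1, 0.1, 0.2, 0.2)
```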
import unittest import numpy as np from op_test import OpTest @@ -7,7 +20,7 @@ class TestDropoutOp(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 0.0, 'is_training': True} + self.attrs = {'dropout_prob': 0.0, 'is_test': False} self.outputs = { 'Out': self.inputs['X'], 'Mask': np.ones((32, 64)).astype('float32') @@ -24,7 +37,7 @@ class TestDropoutOp2(TestDropoutOp): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 1.0, 'is_training': True} + self.attrs = {'dropout_prob': 1.0, 'is_test': False} self.outputs = { 'Out': np.zeros((32, 64)).astype('float32'), 'Mask': np.zeros((32, 64)).astype('float32') @@ -35,7 +48,7 @@ class TestDropoutOp3(TestDropoutOp): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} - self.attrs = {'dropout_prob': 0.0, 'is_training': True} + self.attrs = {'dropout_prob': 0.0, 'is_test': False} self.outputs = { 'Out': self.inputs['X'], 'Mask': np.ones((32, 64, 2)).astype('float32') @@ -46,8 +59,10 @@ class TestDropoutOp4(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 0.35, 'is_training': False} - self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + self.attrs = {'dropout_prob': 0.35, 'is_test': True} + self.outputs = { + 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) + } def test_check_output(self): self.check_output() @@ -57,8 +72,10 @@ class TestDropoutOp5(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")} - self.attrs = {'dropout_prob': 0.75, 'is_training': False} - self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + self.attrs = {'dropout_prob': 0.75, 'is_test': True} + self.outputs = { + 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) + } def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..a946fea58d67887cac40bd35b14e23752d6a1619 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py @@ -0,0 +1,142 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
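The dropout hunk above corrects the test-mode expectation from `x * dropout_prob` to `x * (1 - dropout_prob)`: with keep probability `1 - p`, the training-time expectation of a non-inverted dropout output is `x * (1 - p)`, so that is what the op must return at inference. A quick NumPy check of that identity:

```python
import numpy as np

np.random.seed(0)
x = np.random.random((32, 64)).astype("float32")
p = 0.35  # dropout_prob

# Simulate training-time dropout: each element is kept with prob (1 - p).
samples = [x * (np.random.random(x.shape) >= p) for _ in range(2000)]
empirical_mean = np.mean(samples, axis=0)

# The sample mean should approach x * (1 - p), matching the fixed test.
print(np.abs(empirical_mean - x * (1.0 - p)).max())  # small (sampling noise)
```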
+import paddle.v2.fluid as fluid +import paddle.v2 as paddle +import unittest +import numpy + + +class TestDynRNN(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + self.BATCH_SIZE = 2 + self.train_data = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), + batch_size=self.BATCH_SIZE) + + def test_plain_while_op(self): + main_program = fluid.Program() + startup_program = fluid.Program() + + with fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + sent_emb = fluid.layers.embedding( + input=sentence, size=[len(self.word_dict), 32], dtype='float32') + + label = fluid.layers.data(name='label', shape=[1], dtype='float32') + + rank_table = fluid.layers.lod_rank_table(x=sent_emb) + + sent_emb_array = fluid.layers.lod_tensor_to_array( + x=sent_emb, table=rank_table) + + seq_len = fluid.layers.max_sequence_len(rank_table=rank_table) + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) + i.stop_gradient = False + + boot_mem = fluid.layers.fill_constant_batch_size_like( + input=fluid.layers.array_read( + array=sent_emb_array, i=i), + value=0, + shape=[-1, 100], + dtype='float32') + boot_mem.stop_gradient = False + + mem_array = fluid.layers.array_write(x=boot_mem, i=i) + + cond = fluid.layers.less_than(x=i, y=seq_len) + cond.stop_gradient = False + while_op = fluid.layers.While(cond=cond) + out = fluid.layers.create_array(dtype='float32') + + with while_op.block(): + mem = fluid.layers.array_read(array=mem_array, i=i) + ipt = fluid.layers.array_read(array=sent_emb_array, i=i) + + mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) + + hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') + + fluid.layers.array_write(x=hidden, i=i, array=out) + fluid.layers.increment(x=i, in_place=True) + fluid.layers.array_write(x=hidden, i=i, array=mem_array) + fluid.layers.less_than(x=i, y=seq_len, cond=cond) + + all_timesteps = fluid.layers.array_to_lod_tensor( + x=out, table=rank_table) + last = fluid.layers.sequence_last_step(input=all_timesteps) + logits = fluid.layers.fc(input=last, size=1, act=None) + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(x=loss) + sgd = fluid.optimizer.SGD(1e-4) + sgd.minimize(loss=loss) + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + + data = next(self.train_data()) + val = exe.run(main_program, feed=feeder.feed(data), + fetch_list=[loss])[0] + self.assertEqual((1, ), val.shape) + print(val) + self.assertFalse(numpy.isnan(val)) + + def test_train_dyn_rnn(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + sent_emb = fluid.layers.embedding( + input=sentence, size=[len(self.word_dict), 32], dtype='float32') + + rnn = fluid.layers.DynamicRNN() + + with rnn.block(): + in_ = rnn.step_input(sent_emb) + mem = rnn.memory(shape=[100], dtype='float32') + out_ = fluid.layers.fc(input=[in_, mem], size=100, act='tanh') + rnn.update_memory(mem, out_) + rnn.output(out_) + + last = fluid.layers.sequence_last_step(input=rnn()) + logits = fluid.layers.fc(input=last, size=1, act=None) + label = fluid.layers.data(name='label', shape=[1], dtype='float32') + loss = 
fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(x=loss) + sgd = fluid.optimizer.Adam(1e-3) + sgd.minimize(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + data = next(self.train_data()) + loss_0 = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + for _ in xrange(100): + val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + # loss should be small after 100 mini-batch + self.assertLess(val[0], loss_0[0]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py deleted file mode 100644 index c2d8b48ea944ae40a451492b8e9fad38dda0835c..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py +++ /dev/null @@ -1,171 +0,0 @@ -import logging -import paddle.v2.fluid.core as core -import unittest -from paddle.v2.fluid.op import Operator, DynamicRecurrentOp -import numpy as np - -# for siplicity, just one level LoD -lod_py = [[0, 4, 7, 9, 10]] -input_dim = 30 -num_sents = len(lod_py[0]) - 1 -weight_dim = 15 - - -def create_tensor(scope, name, shape, np_data): - tensor = scope.var(name).get_tensor() - tensor.set_dims(shape) - tensor.set(np_data, core.CPUPlace()) - return tensor - - -class PyRNNStep(object): - def __init__(self): - - self.x = np.random.normal(size=(lod_py[0][-1], - input_dim)).astype("float32") - self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32") - self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32") - self.h_boot = np.random.normal(size=(num_sents, - input_dim)).astype("float32") - - -class DynamicRecurrentOpTest(unittest.TestCase): - ''' - Test RNNOp - - equation: - h_t = \sigma (W x_t + U h_{t-1}) - weights: - - W - - U - vars: - - x - states: - - h - outputs: - - h - ''' - - py = PyRNNStep() - - def forward(self): - self.scope = core.Scope() - self.create_global_variables() - self.create_rnn_op() - self.create_step_net() - ctx = core.DeviceContext.create(core.CPUPlace()) - self.rnnop.run(self.scope, ctx) - state = self.rnnop.get_state("h@state") - print 'state size: ', state.size() - - step_inputs = self.rnnop.get_step_input("x") - print "x size ", step_inputs.size() - for i in range(step_inputs.size()): - print "x %d" % i, np.array(step_inputs.read(i).get_dims()) - step_outputs = self.rnnop.get_step_output('h@state') - print 'step_outputs.size ', step_outputs.size() - output = self.scope.find_var("h@state").get_tensor() - print 'output', np.array(output).shape - - def create_global_variables(self): - # create inlink - x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], - self.py.x) - x_tensor.set_lod(lod_py) - create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) - create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) - create_tensor(self.scope, "h_boot", [num_sents, input_dim], - self.py.h_boot) - self.scope.var("step_scopes") - self.scope.var("h@state") - - def create_rnn_op(self): - # create RNNOp - self.rnnop = DynamicRecurrentOp( - # inputs - inputs=["x"], - initial_states=["h_boot"], - step_net="step_unit", - # outputs - outputs=["h@state"], - step_scopes="step_scopes", - # attributes - ex_states=["h@pre"], - states=["h@state"]) - - def create_step_net(self): - step_unit = core.Net.create() - x_fc_op = 
Operator("mul", X="x", Y="W", Out="Wx") - h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@state") - - for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - step_unit.append_op(op) - step_unit.complete_add_op(True) - self.rnnop.set_step_unit(step_unit) - - def test_forward(self): - print 'test recurrent op forward' - pd_output = self.forward() - print 'pd_output', pd_output - - -class RecurrentGradientOpTest(unittest.TestCase): - py = PyRNNStep() - - def create_forward_op(self): - # create RNNOp - self.forward_op = DynamicRecurrentOp( - # inputs - inputs=["x"], - initial_states=["h_boot"], - step_net="step_unit", - # outputs - outputs=["h@state"], - step_scopes="step_scopes", - # attributes - ex_states=["h@pre"], - states=["h@state"]) - - def create_gradient_op(self): - a = set() - backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a) - - def create_step_net(self): - step_unit = core.Net.create() - x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") - h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@state") - - for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - step_unit.append_op(op) - step_unit.complete_add_op(True) - self.forward_op.set_step_unit(step_unit) - - def create_global_variables(self): - # create inlink - x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], - self.py.x) - x_tensor.set_lod(lod_py) - create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) - create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) - create_tensor(self.scope, "h_boot", [num_sents, input_dim], - self.py.h_boot) - self.scope.var("step_scopes") - self.scope.var("h@state") - - def test_grad(self): - self.scope = core.Scope() - self.create_forward_op() - self.create_global_variables() - self.create_step_net() - self.create_gradient_op() - - -if __name__ == '__main__': - exit( - 0 - ) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py new file mode 100644 index 0000000000000000000000000000000000000000..95cc80739d6940612d303a0033c74aa91a7465cc --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -0,0 +1,377 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+import numpy +import random +import collections +import paddle.v2.fluid as fluid +import unittest +from decorators import * + + +class Memory(object): + def __init__(self, shape, dtype='float32'): + self.ex = numpy.zeros(shape=shape, dtype=dtype) + self.cur = None + + def update(self, val): + assert val.shape == self.ex.shape + assert val.dtype == self.ex.dtype + self.cur = val + + def ex(self): + return self.ex + + def next(self): + self.ex = self.cur + self.cur = None + + def __next__(self): + self.next() + + def reset(self): + self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype) + self.cur = None + + +class Output(object): + def __init__(self): + self.outs = [] + + def next_sequence(self): + self.outs.append([]) + + def out(self, val): + self.outs[-1].append(val) + + def last(self): + return self.outs[-1][-1] + + +class BaseRNN(object): + def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15): + self.num_seq = num_seq + self.inputs = collections.defaultdict(list) + + for _ in xrange(num_seq): + seq_len = random.randint(1, max_seq_len - 1) + for iname in ins: + ishape = ins[iname].get('shape', None) + idtype = ins[iname].get('dtype', 'float32') + lst = [] + for _ in xrange(seq_len): + lst.append(numpy.random.random(size=ishape).astype(idtype)) + self.inputs[iname].append(lst) + + self.mems = dict() + for mname in mems: + mshape = mems[mname].get('shape', None) + mdtype = mems[mname].get('dtype', 'float32') + self.mems[mname] = Memory(shape=mshape, dtype=mdtype) + + self.params = dict() + for pname in params: + pshape = params[pname].get('shape', None) + pdtype = params[pname].get('dtype', 'float32') + self.params[pname] = numpy.random.random(size=pshape).astype(pdtype) + + self.outputs = dict() + + for oname in outs: + self.outputs[oname] = Output() + + def step(self, **kwargs): + raise NotImplementedError() + + def exe(self): + retv = dict() + for out in self.outputs: + retv[out] = [] + + for seq_id in xrange(self.num_seq): + for mname in self.mems: + self.mems[mname].reset() + for out in self.outputs: + self.outputs[out].next_sequence() + + iname0 = self.inputs.keys()[0] + seq_len = len(self.inputs[iname0][seq_id]) + + for step_id in xrange(seq_len): + xargs = dict() + + for iname in self.inputs: + xargs[iname] = self.inputs[iname][seq_id][step_id] + + for mname in self.mems: + xargs[mname] = self.mems[mname] + + for pname in self.params: + xargs[pname] = self.params[pname] + + for out in self.outputs: + xargs[out] = self.outputs[out] + + self.step(**xargs) + + for mname in self.mems: + next(self.mems[mname]) + + for out in self.outputs: + retv[out].append(self.outputs[out].last()) + + for out in retv: + retv[out] = numpy.array(retv[out]) + return retv + + def to_feed(self, place): + feed_dict = dict() + + for iname in self.inputs: + lod = [0] + np_flatten = [] + for seq_id in xrange(len(self.inputs[iname])): + seq_len = len(self.inputs[iname][seq_id]) + lod.append(lod[-1] + seq_len) + np_flatten.extend(self.inputs[iname][seq_id]) + + t = fluid.Tensor() + t.set(numpy.array(np_flatten), place) + t.set_lod([lod]) + feed_dict[iname] = t + + for pname in self.params: + feed_dict[pname] = self.params[pname] + return feed_dict + + def get_numeric_gradient_of_param(self, param_name, delta=0.001): + p = self.params[param_name] + if len(p.shape) != 2: + raise ValueError("Not support get numeric gradient of an parameter," + " which is not matrix") + g = numpy.zeros(shape=p.shape, dtype=p.dtype) + + for i in xrange(p.shape[0]): + for j in xrange(p.shape[1]): + o = 
p[i][j] + p[i][j] += delta + pos = self._exe_mean_out_() + p[i][j] -= 2 * delta + neg = self._exe_mean_out_() + p[i][j] = o + g[i][j] = (pos - neg) / (delta * 2) + return g + + def get_numeric_gradient_of_input(self, + input_name, + delta=0.001, + return_one_tensor=True): + ipt = self.inputs[input_name] + grad = [] + + for seq in ipt: + seq_grad = [] + for item in seq: + item_grad = numpy.zeros(shape=item.shape, dtype=item.dtype) + if len(item.shape) != 1: + raise ValueError("Not support") + + for i in xrange(len(item)): + o = item[i] + item[i] += delta + pos = self._exe_mean_out_() + item[i] -= 2 * delta + neg = self._exe_mean_out_() + item[i] = o + item_grad[i] = (pos - neg) / (delta * 2) + seq_grad.append(item_grad) + grad.append(seq_grad) + + if not return_one_tensor: + return grad + + for i in xrange(len(grad)): + grad[i] = numpy.concatenate(grad[i]) + grad = numpy.concatenate(grad) + return grad + + def _exe_mean_out_(self): + outs = self.exe() + return numpy.array([o.mean() for o in outs.itervalues()]).mean() + + +class SeedFixedTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Fix random seeds to remove randomness from tests""" + cls._np_rand_state = numpy.random.get_state() + cls._py_rand_state = random.getstate() + + numpy.random.seed(123) + random.seed(124) + + @classmethod + def tearDownClass(cls): + """Restore random seeds""" + numpy.random.set_state(cls._np_rand_state) + random.setstate(cls._py_rand_state) + + +class TestSimpleMul(SeedFixedTestCase): + DATA_NAME = 'X' + DATA_WIDTH = 32 + PARAM_NAME = 'W' + HIDDEN_WIDTH = 10 + OUT_NAME = 'Out' + + class SimpleMul(BaseRNN): + def __init__(self): + base = TestSimpleMul + super(base.SimpleMul, self).__init__({ + base.DATA_NAME: { + 'shape': [base.DATA_WIDTH] + } + }, {}, { + base.PARAM_NAME: { + 'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH] + } + }, [base.OUT_NAME]) + + def step(self, X, W, Out): + Out.out(numpy.matmul(X, W)) + + # Test many times in local to ensure the random seed cannot breaks CI + # @many_times(10) + @prog_scope() + def test_forward_backward(self): + py_rnn = TestSimpleMul.SimpleMul() + dat = fluid.layers.data( + name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) + dat.stop_gradient = False + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + d = rnn.step_input(dat) + o = fluid.layers.fc(input=d, + param_attr=self.PARAM_NAME, + bias_attr=False, + size=self.HIDDEN_WIDTH, + act=None) + rnn.output(o) + + out = rnn() + out = fluid.layers.sequence_pool(out, pool_type='last') + loss = fluid.layers.mean(x=out) + fluid.backward.append_backward(loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + out, w_g, i_g = map(numpy.array, + exe.run(feed=py_rnn.to_feed(cpu), + fetch_list=[ + out, self.PARAM_NAME + "@GRAD", + self.DATA_NAME + "@GRAD" + ], + return_numpy=False)) + out_by_python = py_rnn.exe()[self.OUT_NAME] + self.assertTrue(numpy.allclose(out, out_by_python)) + w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) + self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05)) + i_g_num = py_rnn.get_numeric_gradient_of_input( + input_name=self.DATA_NAME) + i_g_num = i_g_num.reshape(i_g.shape) + self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05)) + + +class TestSimpleMulWithMemory(SeedFixedTestCase): + DATA_WIDTH = 32 + HIDDEN_WIDTH = 20 + DATA_NAME = 'X' + PARAM_NAME = 'W' + + class SimpleMulWithMemory(BaseRNN): + def __init__(self): + super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({ + TestSimpleMulWithMemory.DATA_NAME: { + 'shape': 
[TestSimpleMulWithMemory.DATA_WIDTH] + } + }, {'Mem': { + 'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH] + }}, { + TestSimpleMulWithMemory.PARAM_NAME: { + 'shape': [ + TestSimpleMulWithMemory.DATA_WIDTH, + TestSimpleMulWithMemory.HIDDEN_WIDTH + ] + } + }, ['Out']) + + def step(self, X, Mem, W, Out): + o = numpy.matmul(X, W) + assert isinstance(Mem, Memory) + o += Mem.ex + Mem.update(o) + assert isinstance(Out, Output) + Out.out(o) + + # many_times used locally for debug. Make sure the calculation is stable. + # @many_times(10) + @prog_scope() + def test_forward_backward(self): + py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory() + data = fluid.layers.data( + name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) + data.stop_gradient = False + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + d = rnn.step_input(data) + mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH]) + hidden = fluid.layers.fc(input=d, + size=self.HIDDEN_WIDTH, + param_attr=self.PARAM_NAME, + bias_attr=False, + act=None) + o = fluid.layers.elementwise_add(x=hidden, y=mem) + rnn.update_memory(mem, o) + rnn.output(o) + + out = rnn() + last = fluid.layers.sequence_pool(input=out, pool_type='last') + loss = fluid.layers.mean(x=last) + fluid.backward.append_backward(loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + feed = py_rnn.to_feed(cpu) + last_np, w_g, i_g = map(numpy.array, + exe.run(feed=feed, + fetch_list=[ + last, self.PARAM_NAME + "@GRAD", + self.DATA_NAME + "@GRAD" + ], + return_numpy=False)) + last_by_py, = py_rnn.exe().values() + w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) + self.assertTrue(numpy.allclose(last_np, last_by_py)) + + self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1)) + i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME) + i_g_num = i_g_num.reshape(i_g.shape) + + # Since this RNN has many float add. The number could be not stable. + # rtol = 0.1 + self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py new file mode 100644 index 0000000000000000000000000000000000000000..d6878f0b6d07b74612dd47794e254e4d7d98a124 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py @@ -0,0 +1,205 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
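The `get_numeric_gradient_of_*` helpers above perturb one scalar at a time and apply the central-difference estimate `g ≈ (f(p + δ) - f(p - δ)) / (2δ)`, restoring the original value after each probe. A minimal standalone version of that estimator:

```python
import numpy as np


def numeric_gradient(f, p, delta=1e-3):
    """f: scalar-valued function of the array p (evaluated in place)."""
    g = np.zeros_like(p)
    for idx in np.ndindex(*p.shape):
        orig = p[idx]
        p[idx] = orig + delta
        pos = f(p)
        p[idx] = orig - delta
        neg = f(p)
        p[idx] = orig                       # restore before the next probe
        g[idx] = (pos - neg) / (2.0 * delta)
    return g


w = np.random.randn(3, 4)
g = numeric_gradient(lambda m: float((m ** 2).sum()), w)
print(np.allclose(g, 2.0 * w, atol=1e-4))  # True: d/dw sum(w^2) = 2w
```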
+import unittest +import paddle.v2 as paddle +import paddle.v2.fluid.core as core +import paddle.v2.fluid as fluid +from paddle.v2.fluid.backward import append_backward +import paddle.v2.fluid.framework as framework +from paddle.v2.fluid.framework import Program, switch_main_program +import bisect +import numpy as np + +fluid.default_startup_program().random_seed = 1 + + +class TestDyRnnStaticInput(unittest.TestCase): + def setUp(self): + self._delta = 0.005 + self._max_sequence_len = 3 + self._program = Program() + switch_main_program(self._program) + self.output_dim = 10 + self.place = core.CPUPlace() + self.prepare_x_tensor() + self.prepare_static_input_tensor() + self.exe = fluid.Executor(self.place) + + def prepare_x_tensor(self): + self.x_tensor_dim = 10 + lod = [[0, 2, 3, 6]] + shape = [lod[0][-1], self.x_tensor_dim] + self.x_tensor_data = np.random.random(shape).astype('float32') + self.x_tensor = core.LoDTensor() + self.x_tensor.set_lod(lod) + self.x_tensor.set(self.x_tensor_data, self.place) + + def prepare_static_input_tensor(self): + self.static_input_tensor_dim = 4 + lod = [[0, 1, 3, 6]] + shape = [lod[0][-1], self.static_input_tensor_dim] + self.static_input_data = np.random.random(shape).astype('float32') + self.static_input_tensor = core.LoDTensor() + self.static_input_tensor.set_lod(lod) + self.static_input_tensor.set(self.static_input_data, self.place) + + def fetch_value(self, var): + fetch_outs = self.exe.run(feed={ + 'x_tensor': self.x_tensor, + 'static_input_tensor': self.static_input_tensor + }, + fetch_list=[var], + return_numpy=False) + return self._lodtensor_to_ndarray(fetch_outs[0]) + + def _lodtensor_to_ndarray(self, lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray, lod_tensor.lod() + + def build_graph(self, only_forward=False): + x_tensor = fluid.layers.data( + name='x_tensor', + shape=[self.x_tensor_dim], + dtype='float32', + lod_level=1) + x_tensor.stop_gradient = False + + static_input_tensor = fluid.layers.data( + name='static_input_tensor', + shape=[self.static_input_tensor_dim], + dtype='float32', + lod_level=1) + static_input_tensor.stop_gradient = False + + if only_forward: + static_input_out_array = self._program.global_block().create_var( + name='static_input_out_array', + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype='float32') + static_input_out_array.stop_gradient = True + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + step_x = rnn.step_input(x_tensor) + step_static_input = rnn.static_input(static_input_tensor) + if only_forward: + fluid.layers.array_write( + x=step_static_input, + i=rnn.step_idx, + array=static_input_out_array) + last = fluid.layers.sequence_pool( + input=step_static_input, pool_type='last') + projected = fluid.layers.fc(input=[step_x, last], + size=self.output_dim) + rnn.output(projected) + + if only_forward: + static_input_step_outs = [] + step_idx = fluid.layers.fill_constant( + shape=[1], dtype='int64', value=0) + step_idx.stop_gradient = True + + for i in xrange(self._max_sequence_len): + step_out = fluid.layers.array_read(static_input_out_array, + step_idx) + step_out.stop_gradient = True + static_input_step_outs.append(step_out) + fluid.layers.increment(x=step_idx, value=1.0, in_place=True) + + if only_forward: + return static_input_step_outs + + last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') + loss = fluid.layers.mean(x=last) + 
append_backward(loss) + static_input_grad = self._program.global_block().var( + framework.grad_var_name('static_input_tensor')) + return static_input_grad, loss + + def get_seq_len_from_lod(self, lod): + return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)] + + def get_expected_static_step_outs(self): + x_lod = self.x_tensor.lod() + x_seq_len = self.get_seq_len_from_lod(x_lod) + x_seq_len_sorted = sorted(x_seq_len) + x_sorted_indices = np.argsort(x_seq_len)[::-1] + + static_lod = self.static_input_tensor.lod() + static_sliced = [ + self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]] + for i in xrange(len(static_lod[0]) - 1) + ] + static_seq_len = self.get_seq_len_from_lod(static_lod) + static_reordered = [] + for i in xrange(len(x_sorted_indices)): + static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist()) + static_seq_len_reordered = [ + static_seq_len[x_sorted_indices[i]] + for i in xrange(len(x_sorted_indices)) + ] + + static_step_outs = [] + static_step_lods = [] + + for i in xrange(self._max_sequence_len): + end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1) + lod = [0] + for i in xrange(end): + lod.append(static_seq_len_reordered[i] + lod[-1]) + static_step_lods.append([lod]) + end = lod[-1] + static_step_outs.append( + np.array(static_reordered[:end]).astype('float32')) + + return static_step_outs, static_step_lods + + def test_step_out(self): + static_step_outs = self.build_graph(only_forward=True) + self.exe.run(framework.default_startup_program()) + expected_outs, expected_lods = self.get_expected_static_step_outs() + for i in xrange(self._max_sequence_len): + step_out, lod = self.fetch_value(static_step_outs[i]) + self.assertTrue(np.allclose(step_out, expected_outs[i])) + self.assertTrue(np.allclose(lod, expected_lods[i])) + + def test_network_gradient(self): + static_input_grad, loss = self.build_graph() + self.exe.run(framework.default_startup_program()) + + actual_gradients, actual_lod = self.fetch_value(static_input_grad) + + static_input_shape = self.static_input_tensor.get_dims() + numeric_gradients = np.zeros(shape=static_input_shape).astype('float32') + # calculate numeric gradients + tensor_size = np.product(static_input_shape) + for i in xrange(tensor_size): + origin = self.static_input_tensor.get_float_element(i) + x_pos = origin + self._delta + self.static_input_tensor.set_float_element(i, x_pos) + y_pos = self.fetch_value(loss)[0][0] + x_neg = origin - self._delta + self.static_input_tensor.set_float_element(i, x_neg) + y_neg = self.fetch_value(loss)[0][0] + self.static_input_tensor.set_float_element(i, origin) + numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2 + self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001)) + self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_edit_distance_op.py b/python/paddle/v2/fluid/tests/test_edit_distance_op.py new file mode 100644 index 0000000000000000000000000000000000000000..38e87728b387bb70a8921a2fe73a4e69701aabe9 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py @@ -0,0 +1,94 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def Levenshtein(hyp, ref): + """ Compute the Levenshtein distance between two strings. 
+ + :param hyp: hypothesis string in index + :type hyp: list + :param ref: reference string in index + :type ref: list + """ + m = len(hyp) + n = len(ref) + if m == 0: + return n + if n == 0: + return m + + dist = np.zeros((m + 1, n + 1)).astype("float32") + for i in range(0, m + 1): + dist[i][0] = i + for j in range(0, n + 1): + dist[0][j] = j + + for i in range(1, m + 1): + for j in range(1, n + 1): + cost = 0 if hyp[i - 1] == ref[j - 1] else 1 + deletion = dist[i - 1][j] + 1 + insertion = dist[i][j - 1] + 1 + substitution = dist[i - 1][j - 1] + cost + dist[i][j] = min(deletion, insertion, substitution) + return dist[m][n] + + +class TestEditDistanceOp(OpTest): + def setUp(self): + self.op_type = "edit_distance" + normalized = False + x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int32") + x2 = np.array([[0, 12, 4, 7, 8]]).astype("int32") + x1 = np.transpose(x1) + x2 = np.transpose(x2) + x1_lod = [0, 1, 5] + x2_lod = [0, 3, 4] + + num_strs = len(x1_lod) - 1 + distance = np.zeros((num_strs, 1)).astype("float32") + for i in range(0, num_strs): + distance[i] = Levenshtein( + hyp=x1[x1_lod[i]:x1_lod[i + 1]], + ref=x2[x2_lod[i]:x2_lod[i + 1]]) + if normalized is True: + len_ref = x2_lod[i + 1] - x2_lod[i] + distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} + self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} + self.outputs = {'Out': distance} + + def test_check_output(self): + self.check_output() + + +class TestEditDistanceOpNormalized(OpTest): + def setUp(self): + self.op_type = "edit_distance" + normalized = True + x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int32") + x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int32") + x1 = np.transpose(x1) + x2 = np.transpose(x2) + x1_lod = [0, 1, 3, 6] + x2_lod = [0, 2, 3, 5] + + num_strs = len(x1_lod) - 1 + distance = np.zeros((num_strs, 1)).astype("float32") + for i in range(0, num_strs): + distance[i] = Levenshtein( + hyp=x1[x1_lod[i]:x1_lod[i + 1]], + ref=x2[x2_lod[i]:x2_lod[i + 1]]) + if normalized is True: + len_ref = x2_lod[i + 1] - x2_lod[i] + distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} + self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} + self.outputs = {'Out': distance} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_elementwise_add_op.py b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py index 57daddd5698f77527bc5b78c436065a851867ae0..1e88231877f869052d859a30fbbd8f7690b64095 100644 --- a/python/paddle/v2/fluid/tests/test_elementwise_add_op.py +++ b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
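A quick pure-Python sanity check of the Levenshtein DP above, against distances small enough to verify by hand (re-implemented here only so it runs without the OpTest harness; the test encodes strings as lists of ints):

```python
def levenshtein(hyp, ref):
    # Same DP as the test's Levenshtein(): dist[i][j] is the edit distance
    # between the first i tokens of hyp and the first j tokens of ref.
    m, n = len(hyp), len(ref)
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dist[i][0] = i
    for j in range(n + 1):
        dist[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,          # deletion
                             dist[i][j - 1] + 1,          # insertion
                             dist[i - 1][j - 1] + cost)   # substitution
    return dist[m][n]


assert levenshtein([1, 2, 3], [1, 2, 3]) == 0
assert levenshtein([1, 2, 3], [1, 3]) == 1   # one deletion
assert levenshtein([], [7, 8]) == 2          # two insertions
assert levenshtein([1, 2], [3, 4]) == 2      # two substitutions
print("levenshtein sanity checks passed")
```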
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_elementwise_div_op.py b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py index 41cb2b7767eb8e01e46e770a5da21b609f4eb911..fbabc79be2406efa916d2acafe9ce3d69112b160 100644 --- a/python/paddle/v2/fluid/tests/test_elementwise_div_op.py +++ b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py index 261ca9cb3da90dee91b016fee98f67b4c19356a1..ef3a829abc65f3af60b899ba50927424fd563607 100644 --- a/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py +++ b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py index be982e8c57b30b91c2834bd5db38ea3c89f573ee..db24db7b304e2ef8b946e128b22ed62d71260ddc 100644 --- a/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py +++ b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_exception.py b/python/paddle/v2/fluid/tests/test_exception.py index b871f40c4a07ae2db7559e5a0f15664b21e94402..98c4cbe3f2f2ec3a2036a3fdb84aa5ee4600695c 100644 --- a/python/paddle/v2/fluid/tests/test_exception.py +++ b/python/paddle/v2/fluid/tests/test_exception.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core as core import unittest diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py index 709250d0c86dde84ac22c37d8e2385ca4a80a40a..e8baf631e52e4acf645b90b0a9d319d296354944 100644 --- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py +++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py @@ -1,33 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest -from paddle.v2.fluid.layers import mul, data + +import numpy import paddle.v2.fluid.core as core + from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import g_main_program -import numpy +from paddle.v2.fluid.layers import mul, data class TestExecutor(unittest.TestCase): def test_mul(self): - a = data(name='a', shape=[784], data_type='float32') + a = data(name='a', shape=[784], dtype='float32') b = data( name='b', shape=[784, 100], - data_type='float32', + dtype='float32', append_batch_size=False) out = mul(x=a, y=b) place = core.CPUPlace() a_np = numpy.random.random((100, 784)).astype('float32') - tensor_a = core.LoDTensor() - tensor_a.set(a_np, place) b_np = numpy.random.random((784, 100)).astype('float32') - tensor_b = core.LoDTensor() - tensor_b.set(b_np, place) exe = Executor(place) - outs = exe.run(g_main_program, - feed={'a': tensor_a, - 'b': tensor_b}, - fetch_list=[out]) - out = numpy.array(outs[0]) + outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out]) + out = outs[0] self.assertEqual((100, 100), out.shape) self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np))) diff --git a/python/paddle/v2/fluid/tests/test_expand_op.py b/python/paddle/v2/fluid/tests/test_expand_op.py index 0440f7a2bb159bab4923683b5d0980e59e0a69c9..0524f2041fb1b40e5214039fe72d672c1a3691d6 100644 --- a/python/paddle/v2/fluid/tests/test_expand_op.py +++ b/python/paddle/v2/fluid/tests/test_expand_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_feed_fetch_method.py b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py index 178c85b0dd50df61b1fd35ef5d53ebbf39445cb4..718311517dfee51f4e3c724074cd0018a3fa757c 100644 --- a/python/paddle/v2/fluid/tests/test_feed_fetch_method.py +++ b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py index 99de6b5d052b41499800afb6181a235da340bc15..0adc487c04ae4af7ab052530b1d3ca75ec1eddbd 100644 --- a/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py +++ b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_fill_constant_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_op.py index dff7b615aa378b0ef932df47241db07eace61a86..50d4ccb3bdd115f8fa5ea53d5751426cac44cedb 100644 --- a/python/paddle/v2/fluid/tests/test_fill_constant_op.py +++ b/python/paddle/v2/fluid/tests/test_fill_constant_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_fill_op.py b/python/paddle/v2/fluid/tests/test_fill_op.py new file mode 100644 index 0000000000000000000000000000000000000000..42b06ec87c2b1811aa678f0563522f3d90405478 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_fill_op.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +from op_test import OpTest +import paddle.v2.fluid.core as core + + +class TestFillOp(OpTest): + def setUp(self): + self.op_type = "fill" + val = np.random.random(size=[100, 200]) + self.inputs = {} + self.attrs = { + 'value': val.flatten().tolist(), + 'shape': [100, 200], + 'dtype': int(core.DataType.FP64) + } + self.outputs = {'Out': val.astype('float64')} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py index eff8fa87d9c0dafc6935604101e94ee6c8b081ce..a28bed9697309da29dce1220dc8080e0ad2083fd 100644 --- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py +++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest @@ -7,7 +20,7 @@ class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" self.inputs = {'X': np.random.random((219, 232)).astype("float32")} - self.outputs = {'Y': np.zeros_like(self.inputs["X"])} + self.outputs = {'Out': np.zeros_like(self.inputs["X"])} def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_framework_debug_str.py b/python/paddle/v2/fluid/tests/test_framework_debug_str.py index a4cbabdb36362c4ca14b76f366b648d6dbdbf7b3..6c82e67220f36217d88d2b7f73ede8f85e11d632 100644 --- a/python/paddle/v2/fluid/tests/test_framework_debug_str.py +++ b/python/paddle/v2/fluid/tests/test_framework_debug_str.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
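For the new test_fill_op.py above: the fill op receives its data as a flattened 'value' list plus 'shape' and 'dtype' attributes, so the expected tensor is just that list reshaped. A small numpy reconstruction sketch with the same shapes as the test:

    import numpy as np

    val = np.random.random(size=[100, 200])
    attrs = {'value': val.flatten().tolist(), 'shape': [100, 200]}
    # rebuild the dense output the op should produce from the flat value list
    expected = np.array(attrs['value'], dtype='float64').reshape(attrs['shape'])
    assert np.allclose(expected, val)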
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest from paddle.v2.fluid.framework import Program diff --git a/python/paddle/v2/fluid/tests/test_ftrl_op.py b/python/paddle/v2/fluid/tests/test_ftrl_op.py new file mode 100644 index 0000000000000000000000000000000000000000..599233efd93d91514171c46f70994bc45c9c6722 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_ftrl_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +from op_test import OpTest + + +class TestFTRLOp(OpTest): + def setUp(self): + self.op_type = "ftrl" + w = np.random.random((102, 105)).astype("float32") + g = np.random.random((102, 105)).astype("float32") + sq_accum = np.full((102, 105), 0.1).astype("float32") + linear_accum = np.full((102, 105), 0.1).astype("float32") + lr = np.array([0.01]).astype("float32") + l1 = 0.1 + l2 = 0.2 + lr_power = -0.5 + + self.inputs = { + 'Param': w, + 'SquaredAccumulator': sq_accum, + 'LinearAccumulator': linear_accum, + 'Grad': g, + 'LearningRate': lr + } + self.attrs = { + 'l1': l1, + 'l2': l2, + 'lr_power': lr_power, + 'learning_rate': lr + } + new_accum = sq_accum + g * g + if lr_power == -0.5: + linear_out = linear_accum + g - ( + (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w + else: + linear_out = linear_accum + g - ((np.power( + new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w + + x = (l1 * np.sign(linear_out) - linear_out) + if lr_power == -0.5: + y = (np.sqrt(new_accum) / lr) + (2 * l2) + pre_shrink = x / y + param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0) + else: + y = (np.power(new_accum, -lr_power) / lr) + (2 * l2) + pre_shrink = x / y + param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0) + + sq_accum_out = sq_accum + g * g + + self.outputs = { + 'ParamOut': param_out, + 'SquaredAccumOut': sq_accum_out, + 'LinearAccumOut': linear_out + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_gather_op.py b/python/paddle/v2/fluid/tests/test_gather_op.py index b0ab429ef1b53640dfb696f6ea2f7b745564b874..95093f9b846b7bfef4ad3b3dd777c5f7980fe83a 100644 --- a/python/paddle/v2/fluid/tests/test_gather_op.py +++ b/python/paddle/v2/fluid/tests/test_gather_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
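The new test_ftrl_op.py above encodes the FTRL-Proximal update in numpy as its reference output. Restated compactly (a sketch using the test's names; for lr_power == -0.5 the power terms reduce to the square-root branch the test special-cases):

    import numpy as np

    def ftrl_reference(w, g, sq_accum, linear, lr, l1, l2, lr_power):
        new_accum = sq_accum + g * g
        # sigma = (n_new^(-p) - n_old^(-p)) / lr, the per-step rate adjustment
        sigma = (np.power(new_accum, -lr_power) -
                 np.power(sq_accum, -lr_power)) / lr
        linear_out = linear + g - sigma * w
        y = np.power(new_accum, -lr_power) / lr + 2.0 * l2
        pre_shrink = (l1 * np.sign(linear_out) - linear_out) / y
        # weights snap to zero unless the accumulated signal exceeds l1
        param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
        return param_out, new_accum, linear_out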
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index 627ab4e23562f14538d85f2e21edeb7d72d940bb..bf4785211e8e6d4af97f49d67847df7ac72dfa71 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -1,32 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest +import numpy + +import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator -import numpy +from paddle.v2.fluid.executor import Executor class TestGaussianRandomOp(unittest.TestCase): + def setUp(self): + self.op_type = "gaussian_random" + self.inputs = {} + self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10} + + self.outputs = ["Out"] + def test_cpu(self): - self.gaussian_random_test(place=core.CPUPlace()) + self.gaussian_random_test(place=fluid.CPUPlace()) def test_gpu(self): if core.is_compile_gpu(): - self.gaussian_random_test(place=core.GPUPlace(0)) + self.gaussian_random_test(place=fluid.CUDAPlace(0)) def gaussian_random_test(self, place): - scope = core.Scope() - scope.var('Out').get_tensor() - - op = Operator( - "gaussian_random", - Out='Out', - shape=[1000, 784], - mean=.0, - std=1., - seed=10) - - context = core.DeviceContext.create(place) - op.run(scope, context) - tensor = numpy.array(scope.find_var('Out').get_tensor()) + + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + tensor = outs[0] + self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) diff --git a/python/paddle/v2/fluid/tests/test_get_places_op.py b/python/paddle/v2/fluid/tests/test_get_places_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b44011fb76be712d11bbd72ce95027a439a4d2c1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_get_places_op.py @@ -0,0 +1,30 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
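A sanity check on the tolerances in the rewritten test_gaussian_random_op.py above: with shape [1000, 784] the op draws 784,000 samples, so the standard error of the sample mean is about 1 / sqrt(784000) ≈ 0.0011, and the sample std concentrates just as tightly; the delta=0.1 bounds are therefore very loose and the test is effectively deterministic. The same assertion pattern in plain numpy:

    import numpy

    samples = numpy.random.normal(loc=0.0, scale=1.0, size=(1000, 784))
    assert abs(numpy.mean(samples)) < 0.1        # typical deviation ~0.0011
    assert abs(numpy.std(samples) - 1.0) < 0.1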
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import paddle.v2.fluid as fluid +import decorators +import unittest + + +class TestGetPlaces(unittest.TestCase): + @decorators.prog_scope() + def test_get_places(self): + places = fluid.layers.get_places() + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_main_program()) + self.assertEqual(places.type, fluid.core.VarDesc.VarType.PLACE_LIST) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py index fa2c5a53ec4a01b6545e25f773c11277a4d24706..a6647d1bf28b8b4412d40f795d59eb526e0b7781 100644 --- a/python/paddle/v2/fluid/tests/test_gru_op.py +++ b/python/paddle/v2/fluid/tests/test_gru_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import math diff --git a/python/paddle/v2/fluid/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py index f356f6e9ec0da2d3e1fb67638d81e8d54c544f53..53f10c32c7cd143b89da77e24555a7f59473f3a1 100644 --- a/python/paddle/v2/fluid/tests/test_gru_unit_op.py +++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import math import unittest import numpy as np @@ -28,8 +41,8 @@ def relu(x): class TestGRUUnitOp(OpTest): - batch_size = 3 - frame_size = 5 + batch_size = 5 + frame_size = 10 activate = { GRUActivationType.identity: identity, GRUActivationType.sigmoid: sigmoid, @@ -77,7 +90,7 @@ class TestGRUUnitOp(OpTest): c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) + g[:, frame_size * 2:]) g = np.hstack((u_r, c)) - h = u * h_p + (1 - u) * c + h = u * c + (1 - u) * h_p self.outputs = { 'Gate': g.astype('float64'), 'ResetHiddenPrev': r_h_p.astype('float64'), @@ -92,10 +105,7 @@ class TestGRUUnitOp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad( - ['Input', 'HiddenPrev', 'Weight'], - ['Hidden', 'ResetHiddenPrev', 'Gate'], - max_relative_error=0.007) + self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden']) class TestGRUUnitOpWithBias(TestGRUUnitOp): @@ -104,18 +114,20 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp): frame_size = self.frame_size super(TestGRUUnitOpWithBias, self).set_inputs() self.inputs['Bias'] = np.random.uniform( - -0.1, 0.1, (1, frame_size * 3)).astype('float32') + -0.1, 0.1, (1, frame_size * 3)).astype('float64') self.attrs = { 'activation': GRUActivationType.identity, 'gate_activation': GRUActivationType.sigmoid } def test_check_grad(self): + self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden']) + + def test_check_grad_ingore_input(self): self.check_grad( - ['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'], - max_relative_error=0.007) + ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'], + no_grad_set=set('Input')) if __name__ == '__main__': - exit(0) # FIXME(yuyang18): This unittest is not pass. Fix it later unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..dc7774d01c0a6d6a80d0e914cd327e1a4f9919e9 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py @@ -0,0 +1,41 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
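The sign fix in test_gru_unit_op.py above is the substantive change: a GRU hidden state is a gated interpolation h = u * c + (1 - u) * h_prev, where the update gate u weights the new candidate c rather than the previous state. A tiny numpy illustration of that gating:

    import numpy as np

    def gru_hidden(u, c, h_prev):
        # u near 1 takes the new candidate, u near 0 keeps the old state
        return u * c + (1.0 - u) * h_prev

    u = np.full((5, 10), 0.9)      # update gate mostly open
    c = np.ones((5, 10))           # candidate state
    h_prev = np.zeros((5, 10))     # previous hidden state
    assert np.allclose(gru_hidden(u, c, h_prev), 0.9)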
+import unittest +import numpy as np +from op_test import OpTest + + +class TestHingeLossOp(OpTest): + def setUp(self): + self.op_type = 'hinge_loss' + samples_num = 64 + logits = np.random.uniform(-10, 10, (samples_num, 1)).astype('float32') + labels = np.random.randint(0, 2, (samples_num, 1)).astype('float32') + + self.inputs = { + 'Logits': logits, + 'Labels': labels, + } + loss = np.maximum(1.0 - (2 * labels - 1) * logits, 0) + self.outputs = {'Loss': loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Logits'], 'Loss', max_relative_error=0.008) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_huber_loss_op.py index a24fcbec6cc4801118ce4ef97eb4692cd2351c28..18a48bb18ced83d79e602e0103a3e8d0a87c66ef 100644 --- a/python/paddle/v2/fluid/tests/test_huber_loss_op.py +++ b/python/paddle/v2/fluid/tests/test_huber_loss_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py index bf5444107fa1609e67b09823b82e5fb92234b0a4..9d676e87594dfad7ede3ac237effb234f44e8369 100644 --- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py +++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py @@ -1,16 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
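The reference loss in the new test_hinge_loss_op.py above maps {0, 1} labels to the {-1, +1} margin convention via 2y - 1, then applies the standard hinge loss max(0, 1 - t * f(x)). In a few lines of numpy:

    import numpy as np

    def hinge_loss(logits, labels01):
        # 2 * y - 1 turns {0, 1} labels into {-1, +1} targets
        return np.maximum(1.0 - (2.0 * labels01 - 1.0) * logits, 0.0)

    print(hinge_loss(np.array([2.0, -0.5]), np.array([1.0, 0.0])))  # [0.  0.5]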
import unittest -import paddle.v2.fluid.layers as layers +import paddle.v2.fluid as fluid import paddle.v2.fluid.nets as nets from paddle.v2.fluid.framework import Program -def conv_block(input, - num_filter, - groups, - dropouts, - main_program=None, - startup_program=None): +def conv_block(input, num_filter, groups, dropouts): return nets.img_conv_group( input=input, pool_size=2, @@ -20,82 +28,54 @@ def conv_block(input, conv_act='relu', conv_with_batchnorm=True, conv_batchnorm_drop_rate=dropouts, - pool_type='max', - main_program=main_program, - startup_program=startup_program) + pool_type='max') class TestLayer(unittest.TestCase): def test_batch_norm_layer(self): main_program = Program() startup_program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - data_type='float32', - main_program=main_program) - layers.batch_norm( - input=images, - main_program=main_program, - startup_program=startup_program) + with fluid.program_guard(main_program, startup_program): + images = fluid.layers.data( + name='pixel', shape=[3, 48, 48], dtype='float32') + hidden1 = fluid.layers.batch_norm(input=images) + hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu') + fluid.layers.batch_norm(input=hidden2) - # print str(main_program) + print str(main_program) def test_dropout_layer(self): main_program = Program() startup_program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - data_type='float32', - main_program=main_program) - layers.dropout( - x=images, - dropout_prob=0.5, - main_program=main_program, - startup_program=startup_program) + with fluid.program_guard(main_program, startup_program): + images = fluid.layers.data( + name='pixel', shape=[3, 48, 48], dtype='float32') + fluid.layers.dropout(x=images, dropout_prob=0.5) - # print str(main_program) + print str(main_program) def test_img_conv_group(self): main_program = Program() startup_program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - data_type='float32', - main_program=main_program, - startup_program=startup_program) - conv1 = conv_block(images, 64, 2, [0.3, 0], main_program, - startup_program) - conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program, - startup_program) + with fluid.program_guard(main_program, startup_program): + images = fluid.layers.data( + name='pixel', shape=[3, 48, 48], dtype='float32') + conv1 = conv_block(images, 64, 2, [0.3, 0]) + conv_block(conv1, 256, 3, [0.4, 0.4, 0]) - # print str(main_program) + print str(main_program) def test_elementwise_add_with_act(self): main_program = Program() startup_program = Program() - image1 = layers.data( - name='pixel1', - shape=[3, 48, 48], - data_type='float32', - main_program=main_program, - startup_program=startup_program) - image2 = layers.data( - name='pixel2', - shape=[3, 48, 48], - data_type='float32', - main_program=main_program, - startup_program=startup_program) - out = layers.elementwise_add( - x=image1, - y=image2, - act='relu', - main_program=main_program, - startup_program=startup_program) - # print(main_program) + with fluid.program_guard(main_program, startup_program): + image1 = fluid.layers.data( + name='pixel1', shape=[3, 48, 48], dtype='float32') + image2 = fluid.layers.data( + name='pixel2', shape=[3, 48, 48], dtype='float32') + fluid.layers.elementwise_add(x=image1, y=image2, act='relu') + print(main_program) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_infer_shape.py b/python/paddle/v2/fluid/tests/test_infer_shape.py index 
9f6695ce02de749178046fbb613a58ba591b3dbc..0c2a6f1423c45b3a5c8cb9f0d9b3d7004f997b36 100644 --- a/python/paddle/v2/fluid/tests/test_infer_shape.py +++ b/python/paddle/v2/fluid/tests/test_infer_shape.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.core as core diff --git a/python/paddle/v2/fluid/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py index 98b95713b73e8eba93bd6a58eaaed603cfae7952..c5cad2166bd17241c6c86e5cd0614df36ad2961c 100644 --- a/python/paddle/v2/fluid/tests/test_inference_model_io.py +++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py @@ -1,13 +1,26 @@ -import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
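The rewrite of test_image_classification_layer.py above, and of test_inference_model_io.py just below, replaces the main_program=/startup_program= keyword threading with the program_guard context manager, which makes the guarded programs the construction defaults for everything built inside the with block. The pattern in isolation (a sketch using the same fluid API as these tests):

    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.framework import Program, program_guard

    main_program = Program()
    startup_program = Program()
    with program_guard(main_program, startup_program):
        # layers built here land in main_program; parameter initializers
        # land in startup_program, with no extra keyword arguments
        x = layers.data(name='x', shape=[2], dtype='float32')
        y_predict = layers.fc(input=x, size=1, act=None)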
+import unittest + +import numpy as np import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.framework import Program -from paddle.v2.fluid.io import save_inference_model, load_inference_model import paddle.v2.fluid.executor as executor -import unittest -import numpy as np +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.optimizer as optimizer +from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.io import save_inference_model, load_inference_model class TestBook(unittest.TestCase): @@ -16,35 +29,18 @@ class TestBook(unittest.TestCase): init_program = Program() program = Program() - x = layers.data( - name='x', - shape=[2], - data_type='float32', - main_program=program, - startup_program=init_program) - y = layers.data( - name='y', - shape=[1], - data_type='float32', - main_program=program, - startup_program=init_program) - - y_predict = layers.fc(input=x, - size=1, - act=None, - main_program=program, - startup_program=init_program) - - cost = layers.square_error_cost( - input=y_predict, - label=y, - main_program=program, - startup_program=init_program) - avg_cost = layers.mean( - x=cost, main_program=program, startup_program=init_program) - - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) - opts = sgd_optimizer.minimize(avg_cost, init_program) + + with program_guard(program, init_program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[1], dtype='float32') + + y_predict = layers.fc(input=x, size=1, act=None) + + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() exe = executor.Executor(place) @@ -52,25 +48,20 @@ class TestBook(unittest.TestCase): exe.run(init_program, feed={}, fetch_list=[]) for i in xrange(100): - x_data = np.array( + tensor_x = np.array( [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32") - y_data = np.array([[-2], [-3], [-7], [-7]]).astype("float32") + tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32") - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) exe.run(program, feed={'x': tensor_x, 'y': tensor_y}, fetch_list=[avg_cost]) save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) - outs = exe.run(program, - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost]) - expected = np.array(outs[0]) + expected = exe.run(program, + feed={'x': tensor_x, + 'y': tensor_y}, + fetch_list=[avg_cost])[0] reload(executor) # reload to build a new scope exe = executor.Executor(place) @@ -83,7 +74,7 @@ class TestBook(unittest.TestCase): feed={feed_var_names[0]: tensor_x, feed_var_names[1]: tensor_y}, fetch_list=fetch_vars) - actual = np.array(outs[0]) + actual = outs[0] self.assertEqual(feed_var_names, ["x", "y"]) self.assertEqual(len(fetch_vars), 1) diff --git a/python/paddle/v2/fluid/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py index f2eb79b209627f5814847db6d96c0a17300d9b5a..fa3c2afeedca4247379a34dea957bcceb9f134e1 100644 --- a/python/paddle/v2/fluid/tests/test_initializer.py +++ b/python/paddle/v2/fluid/tests/test_initializer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
All Rights Reserve.
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np import unittest @@ -60,6 +73,29 @@ class TestUniformInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) + def test_uniform_initializer_random_seed(self): + """Test the uniform initializer with manually setting seed + """ + program = framework.Program() + program.random_seed = 123 + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer()) + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(seed=456)) + init_op = block.ops[1] + self.assertEqual(init_op.attr("seed"), 123) + init_op1 = block.ops[0] + self.assertEqual(init_op1.attr("seed"), 456) + def test_uniform_initializer(self): """Test uniform initializer with supplied attributes """ @@ -223,5 +259,109 @@ class TestXavierInitializer(unittest.TestCase): self.assertEqual(init_op.attr('seed'), 134) +class TestMSRAInitializer(unittest.TestCase): + def test_uniform_msra_initializer(self): + """Test MSRA initializer with uniform distribution on + for matrix multiply. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + limit = np.sqrt(6.0 / param.shape[0]) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_uniform_msra_initializer_conv(self): + """Test MSRA initializer with uniform distribution on + for convolutions. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + receptive_field_size = float(15 * 20) + limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size)) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_normal_msra_initializer(self): + """Test MSRA initializer with normal distribution on + for matrix multiply. 
+ """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + std = np.sqrt(2.0 / param.shape[0]) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_normal_msra_initializer_conv(self): + """Test MSRA initializer with normal distribution on + for convolutions. + """ + program = framework.Program() + block = program.global_block() + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + receptive_field_size = float(15 * 20) + std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size)) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_msra_initializer_supplied_arguments(self): + """Test the MSRA initializer with supplied arguments + """ + program = framework.Program() + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer( + fan_in=12, seed=134)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + limit = np.sqrt(6.0 / 12) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 134) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py index ed6e3fe24f6333c9c90d760787eb13241a7e1868..d6876a885f53ae2dc574871013d570a801b73c7f 100644 --- a/python/paddle/v2/fluid/tests/test_is_empty_op.py +++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import numpy as np from paddle.v2.fluid.op import Operator @@ -33,8 +46,7 @@ class TestIsEmptyOp(unittest.TestCase): def one_case(self, input, target): op = Operator(type="is_empty", X=input, Out="out") - ctx = core.DeviceContext.create(core.CPUPlace()) - op.run(self.scope, ctx) + op.run(self.scope, core.CPUPlace()) out = self.scope.var("out").get_tensor() self.assertEqual(np.array(out)[0], target) diff --git a/python/paddle/v2/fluid/tests/test_l1_norm_op.py b/python/paddle/v2/fluid/tests/test_l1_norm_op.py index 3a1d1689fe6f941e95ca2df171a1e8e03278076d..92484c49f03ebad6f07ff2613c346545d15f1d72 100644 --- a/python/paddle/v2/fluid/tests/test_l1_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_l1_norm_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np import unittest from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 3d18e7ce3a4dc6c6b917a1000de39fca71f6ac18..a4e155b534a41e385167e6a6f01e32cfedf580e2 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -1,170 +1,220 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
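The test_is_empty_op.py change above reflects an API update: Operator.run now takes a place directly instead of a manually created DeviceContext. A sketch of the new call, hedged to follow the test's own scope and op setup:

    import numpy as np
    import paddle.v2.fluid.core as core
    from paddle.v2.fluid.op import Operator

    scope = core.Scope()
    x = scope.var("X").get_tensor()
    x.set(np.ones((3, 4)).astype("float32"), core.CPUPlace())
    scope.var("out").get_tensor()

    op = Operator(type="is_empty", X="X", Out="out")
    op.run(scope, core.CPUPlace())   # a place, not a DeviceContext
    print(np.array(scope.find_var("out").get_tensor())[0])  # False: X has data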
+from __future__ import print_function +import unittest + import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program -import paddle.v2.fluid.core as core -import unittest +from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.param_attr import ParamAttr class TestBook(unittest.TestCase): def test_fit_a_line(self): program = Program() - x = layers.data( - name='x', shape=[13], data_type='float32', main_program=program) - y_predict = layers.fc(input=x, size=1, act=None, main_program=program) - - y = layers.data( - name='y', shape=[1], data_type='float32', main_program=program) - cost = layers.square_error_cost( - input=y_predict, label=y, main_program=program) + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + program.append_backward(avg_cost) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - program.append_backward(avg_cost) - print str(program) + print(str(program)) def test_recognize_digits_mlp(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', - shape=[784], - data_type='float32', - main_program=program) - label = layers.data( - name='label', shape=[1], data_type='int32', main_program=program) - hidden1 = layers.fc(input=images, - size=128, - act='relu', - main_program=program) - hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - main_program=program) - predict = layers.fc(input=hidden2, - size=10, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - print str(program) + with program_guard(program, startup_program=Program()): + # Change g_program, so the rest layers use `g_program` + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden1 = layers.fc(input=images, size=128, act='relu') + hidden2 = layers.fc(input=hidden1, size=64, act='relu') + predict = layers.fc(input=[hidden2, hidden1], + size=10, + act='softmax', + param_attr=["sftmax.w1", "sftmax.w2"]) + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_simple_conv2d(self): program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - data_type='int32', - main_program=program) - layers.conv2d( - input=images, - num_filters=3, - filter_size=[4, 4], - main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) - def test_recognize_digits_conv(self): + print(str(program)) + + def test_conv2d_transpose(self): program = Program() + with program_guard(program): + img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') + layers.conv2d_transpose(input=img, num_filters=10, output_size=28) + print(str(program)) - images = layers.data( - name='pixel', - shape=[1, 28, 28], - 
data_type='float32', - main_program=program) - label = layers.data( - name='label', shape=[1], data_type='int32', main_program=program) - conv_pool_1 = nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=2, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - conv_pool_2 = nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=4, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - - predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax", - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - - program.append_backward(avg_cost) - - print str(program) + def test_recognize_digits_conv(self): + program = Program() + with program_guard(program, startup_program=Program()): + images = layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + conv_pool_1 = nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=2, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=4, + pool_size=2, + pool_stride=2, + act="relu") + + predict = layers.fc(input=conv_pool_2, size=10, act="softmax") + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + program.append_backward(avg_cost) + + print(str(program)) def test_word_embedding(self): program = Program() - dict_size = 10000 - embed_size = 32 - first_word = layers.data( - name='firstw', shape=[1], data_type='int64', main_program=program) - second_word = layers.data( - name='secondw', shape=[1], data_type='int64', main_program=program) - third_word = layers.data( - name='thirdw', shape=[1], data_type='int64', main_program=program) - forth_word = layers.data( - name='forthw', shape=[1], data_type='int64', main_program=program) - next_word = layers.data( - name='nextw', shape=[1], data_type='int64', main_program=program) - - embed_first = layers.embedding( - input=first_word, - size=[dict_size, embed_size], - data_type='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - embed_second = layers.embedding( - input=second_word, - size=[dict_size, embed_size], - data_type='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - - embed_third = layers.embedding( - input=third_word, - size=[dict_size, embed_size], - data_type='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - embed_forth = layers.embedding( - input=forth_word, - size=[dict_size, embed_size], - data_type='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - - concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1, - main_program=program) - - hidden1 = layers.fc(input=concat_embed, - size=256, - act='sigmoid', - main_program=program) - predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict_word, label=next_word, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + dict_size = 10000 + embed_size = 32 + first_word = layers.data(name='firstw', shape=[1], dtype='int64') + second_word = layers.data(name='secondw', shape=[1], dtype='int64') + third_word = 
layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = layers.data(name='forthw', shape=[1], dtype='int64') + next_word = layers.data(name='nextw', shape=[1], dtype='int64') + + embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + + hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid') + predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) + + def test_linear_chain_crf(self): + program = Program() + with program_guard(program, startup_program=Program()): + label_dict_len = 10 + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden = layers.fc(input=images, size=128) + crf = layers.linear_chain_crf( + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + layers.chunk_eval( + input=crf_decode, + label=label, + chunk_scheme="IOB", + num_chunk_types=(label_dict_len - 1) / 2) + self.assertNotEqual(crf, None) + self.assertNotEqual(crf_decode, None) + + print(str(program)) + + def test_sigmoid_cross_entropy(self): + program = Program() + with program_guard(program): + dat = layers.data(name='data', shape=[10], dtype='float32') + lbl = layers.data(name='label', shape=[10], dtype='float32') + self.assertIsNotNone( + layers.sigmoid_cross_entropy_with_logits( + x=dat, label=lbl)) + print(str(program)) + + def test_sequence_expand(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10], dtype='float32') + y = layers.data( + name='y', shape=[10, 20], dtype='float32', lod_level=1) + self.assertIsNotNone(layers.sequence_expand(x=x, y=y)) + print(str(program)) + + def test_lstm_unit(self): + program = Program() + with program_guard(program): + x_t_data = layers.data( + name='x_t_data', shape=[10, 10], dtype='float32') + x_t = layers.fc(input=x_t_data, size=10) + prev_hidden_data = layers.data( + name='prev_hidden_data', shape=[10, 30], dtype='float32') + prev_hidden = layers.fc(input=prev_hidden_data, size=30) + prev_cell_data = layers.data( + name='prev_cell', shape=[10, 30], dtype='float32') + prev_cell = layers.fc(input=prev_cell_data, size=30) + self.assertIsNotNone( + layers.lstm_unit( + x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell)) + print(str(program)) + + def test_sequence_softmax(self): + program = Program() + with program_guard(program): + seq_data = layers.data( + name='seq_data', shape=[10, 10], dtype='float32', lod_level=1) + seq = layers.fc(input=seq_data, size=20) + self.assertIsNotNone(layers.sequence_softmax(x=seq)) + print(str(program)) + + def test_get_places(self): + program = Program() + with program_guard(program): + x = layers.get_places(device_count=4) + self.assertIsNotNone(x) + 
print(str(program)) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py index 6f06a66c825b37ee91214efc0a29a58f0b9057f9..cd917dff7f26a04cd289470b3191ad08d7935169 100644 --- a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py +++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import random import numpy as np @@ -104,7 +117,7 @@ class TestLinearChainCrfOp(OpTest): transition_exps = np.exp(transition) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32") + low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") self.inputs = { "Emission": (emission, lod), diff --git a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py index a01ae83772185df218b8c453557dc0cac719673b..f80136cb0d8bf5ae87cfa140d0b8aa895d38568d 100644 --- a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py +++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor @@ -13,7 +26,7 @@ class TestLoDArrayLength(unittest.TestCase): arr_len = layers.array_length(arr) cpu = core.CPUPlace() exe = Executor(cpu) - result = numpy.array(exe.run(fetch_list=[arr_len])[0]) + result = exe.run(fetch_list=[arr_len])[0] self.assertEqual(11, result[0]) diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py index bbc11930b9e804c2769cc590c298c6e90dc36ca6..673605d79c72ef2d1251659afeee458ee0d0ac91 100644 --- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py +++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py @@ -1,6 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
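One derivation worth spelling out from the new test_linear_chain_crf case in test_layers.py above: num_chunk_types=(label_dict_len - 1) / 2 follows from the IOB tagging scheme, where every chunk type contributes a B- tag and an I- tag and one extra label is the outside tag O, so label_dict_len >= 2 * num_chunk_types + 1. A quick check (Python 2 integer division, as in the test):

    label_dict_len = 10
    num_chunk_types = (label_dict_len - 1) / 2      # -> 4
    assert 2 * num_chunk_types + 1 <= label_dict_len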
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from paddle.v2.fluid.layers import lod_rank_table, data from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import g_main_program import paddle.v2.fluid.core as core import numpy import unittest @@ -18,7 +30,7 @@ class TestLoDRankTable(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.random.random(size=(17, 100)), cpu) tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) - exe.run(g_main_program, scope=scope, feed={'x': tensor}) + exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items()) diff --git a/python/paddle/v2/fluid/tests/test_lod_reset_op.py b/python/paddle/v2/fluid/tests/test_lod_reset_op.py index 652ccecfa443fc95f08f52df766709cb550f4049..d799dbfa217bc6012e1225216882a95fc0998544 100644 --- a/python/paddle/v2/fluid/tests/test_lod_reset_op.py +++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py index d6d3e23fd8898a62528d63795d1bff1b72752477..c593b1e06132036ecdf89ef47b1074198c346cc6 100644 --- a/python/paddle/v2/fluid/tests/test_lod_tensor_array.py +++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.core as core import numpy diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py index b18cb6b49fa41f26e1b6de1128690507c5a2f099..5887f9799a11106656306852a7ba4f7b2ef9ebd4 100644 --- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py +++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py @@ -1,10 +1,23 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
All Rights Reserve.
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.core as core import numpy import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -18,7 +31,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set_lod([[0, 3, 9, 10]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) - self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=[] * 6, + expect_max_len=6) def test_lod_tensor_to_array_level_0_empty_seq(self): tensor = core.LoDTensor() @@ -27,7 +44,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set_lod([[0, 3, 9, 9, 10]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) - self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=[] * 6, + expect_max_len=6) def test_lod_tensor_to_array_level_1(self): tensor = core.LoDTensor() @@ -44,7 +65,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ] lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]] - self.main(tensor=tensor, expect_array=expect, expect_lod=lod) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=3) def test_lod_tensor_to_array_level_1_empty_seq(self): tensor = core.LoDTensor() @@ -63,7 +88,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ] lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]] - self.main(tensor=tensor, expect_array=expect, expect_lod=lod) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=4) def test_lod_tensor_to_array_level_2(self): tensor = core.LoDTensor() @@ -80,7 +109,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ] lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]], [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]] - self.main(tensor=tensor, expect_array=expect, expect_lod=lod) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=3) def test_lod_tensor_to_array_level_2_skip_level(self): tensor = core.LoDTensor() @@ -88,19 +121,27 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): numpy.arange(50).reshape(50, 1).astype('int32'), self.place()) tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13], [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]]) - self.main(tensor=tensor, expect_array=None, expect_lod=None, level=1) - - def main(self, tensor, expect_array, expect_lod, level=0): + self.main( + tensor=tensor, + expect_array=None, + expect_lod=None, + expect_max_len=4, + level=1) + + def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0): place = 
self.place() program = Program() - x = layers.data(name='x', shape=[10], main_program=program) - x.persistable = True - table = layers.lod_rank_table(x, level=level, main_program=program) - array = layers.lod_tensor_to_array(x, table, main_program=program) - array.persistable = True - - result = layers.array_to_lod_tensor(array, table, main_program=program) - result.persistable = True + with program_guard(program): + x = layers.data(name='x', shape=[10]) + x.persistable = True + table = layers.lod_rank_table(x, level=level) + max_len = layers.max_sequence_len(table) + max_len.persistable = True + array = layers.lod_tensor_to_array(x, table) + array.persistable = True + + result = layers.array_to_lod_tensor(array, table) + result.persistable = True exe = Executor(place) scope = core.Scope() exe.run(program, feed={'x': tensor}, scope=scope) @@ -110,6 +151,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): self.check_array_same(array, expect_array, expect_lod) self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor) + self.assertEqual( + numpy.array(scope.find_var(max_len.name).get_tensor())[0], + expect_max_len) + def check_array_same(self, array, expect_tensor, expect_lod): self.assertEqual(len(expect_tensor), len(array)) for i, exp in enumerate(zip(expect_tensor, expect_lod)): @@ -129,19 +174,16 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): place = core.CPUPlace() program = Program() - x = layers.data( - name='x', - shape=[1], - data_type='float32', - main_program=program, - stop_gradient=False) - table = layers.lod_rank_table(x, level=0, main_program=program) - array = layers.lod_tensor_to_array(x, table, main_program=program) - result = layers.array_to_lod_tensor(array, table, main_program=program) + with program_guard(program): + x = layers.data( + name='x', shape=[1], dtype='float32', stop_gradient=False) + table = layers.lod_rank_table(x, level=0) + array = layers.lod_tensor_to_array(x, table) + result = layers.array_to_lod_tensor(array, table) - mean = layers.mean(x=result, main_program=program) + mean = layers.mean(x=result) - append_backward_ops(mean) + append_backward(mean) tensor = core.LoDTensor() tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) @@ -151,10 +193,11 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): exe = Executor(place) g_out = [ - item.sum() - for item in map( - numpy.array, - exe.run(program, feed={'x': tensor}, fetch_list=[g_vars])) + numpy.array(item).sum() + for item in exe.run(program, + feed={'x': tensor}, + fetch_list=[g_vars], + return_numpy=False) ] g_out_sum = numpy.array(g_out).sum() diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..fde99bfaa16e6d260a4eefc295dbc765c3816861 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
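For reference, the new test_log_loss_op.py below checks the op against a plain numpy evaluation of the epsilon-guarded log loss. A minimal standalone sketch of that same formula, with made-up toy values:

    import numpy as np

    predicted = np.array([[0.9], [0.2]], dtype='float32')
    labels = np.array([[1.0], [0.0]], dtype='float32')
    epsilon = 1e-4
    # identical to the expression used to build self.outputs in the test
    loss = -labels * np.log(predicted + epsilon) - \
        (1 - labels) * np.log(1 - predicted + epsilon)
    print(loss)  # small for confident, correct predictions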
+import unittest +import numpy as np +from op_test import OpTest + + +class TestLogLossOp(OpTest): + def setUp(self): + self.op_type = 'log_loss' + samples_num = 32 + + predicted = np.random.uniform(0.1, 1.0, + (samples_num, 1)).astype("float32") + labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32") + epsilon = 1e-4 + self.inputs = { + 'Predicted': predicted, + 'Labels': labels, + } + + self.attrs = {'epsilon': epsilon} + loss = -labels * np.log(predicted + epsilon) - ( + 1 - labels) * np.log(1 - predicted + epsilon) + self.outputs = {'Loss': loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_logical_op.py b/python/paddle/v2/fluid/tests/test_logical_op.py index ac90bf839cb96053387bb82c112692136707744c..8c9e8de739f91a94352f1d9cfa5ce225be929b42 100644 --- a/python/paddle/v2/fluid/tests/test_logical_op.py +++ b/python/paddle/v2/fluid/tests/test_logical_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import op_test import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py index a56a549e69eaf950df39853a63947a8abac930d7..1ff6b305bc9269af6af5377801b90b2839a7c079 100644 --- a/python/paddle/v2/fluid/tests/test_lookup_table_op.py +++ b/python/paddle/v2/fluid/tests/test_lookup_table_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py index 7e34b3c91c16c440f12c51415c509400e1f315dc..051704617e733823e07db7a96d9dff5a8bb95afa 100644 --- a/python/paddle/v2/fluid/tests/test_lrn_op.py +++ b/python/paddle/v2/fluid/tests/test_lrn_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest @@ -23,7 +36,7 @@ class TestLRNOp(OpTest): start = -(self.n - 1) / 2 end = start + self.n - mid = np.empty((self.N, self.C, self.H, self.W), dtype=float) + mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32") mid.fill(self.k) for m in range(0, self.N): for i in range(0, self.C): @@ -74,5 +87,4 @@ class TestLRNOp(OpTest): if __name__ == "__main__": - exit(0) # LRN grad implement wrong unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py index 77f062e8c8870ec9cc56c9566108abe74665ae30..76ea8def7cbe66f81b2c548fc2cc9fb9a46c77eb 100644 --- a/python/paddle/v2/fluid/tests/test_lstm_op.py +++ b/python/paddle/v2/fluid/tests/test_lstm_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_lstm_unit_op.py b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py index 6bad2e1f7c34c51419424d88b41b809da997eb8f..c97c1e72aaa1e83f89e24ad5f72d022edef5f7b8 100644 --- a/python/paddle/v2/fluid/tests/test_lstm_unit_op.py +++ b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py index 63378cbc4ec95d7d3c49a92f750b55a8dbc22414..3d8c1d19f90ed4e5945646b85cebb20bb5053f53 100644 --- a/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py +++ b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py index d51572c8ab7c44fa0c6e83e50b56f05780530c61..0220cfe1287ab6908f379f9f7451cd257dd7f0a6 100644 --- a/python/paddle/v2/fluid/tests/test_matmul_op.py +++ b/python/paddle/v2/fluid/tests/test_matmul_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py index 05e42f315833cab5bc5272cbd2173ea8012ff7f5..ed8c0d2b67199fb880b4f314a2817a128bf1ad3a 100644 --- a/python/paddle/v2/fluid/tests/test_maxout_op.py +++ b/python/paddle/v2/fluid/tests/test_maxout_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest @@ -30,9 +43,7 @@ class TestMaxOutOp(OpTest): def init_test_case(self): self.MaxOut_forward_naive = maxout_forward_naive self.shape = [100, 6, 2, 2] - self.groups=2 - - + self.groups = 2 if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_mean_op.py b/python/paddle/v2/fluid/tests/test_mean_op.py index 7823abd8f813aad6462c98a9ace9a13dc286a157..f9d7d6921e45d10ab2e22c0764c129a8a473726a 100644 --- a/python/paddle/v2/fluid/tests/test_mean_op.py +++ b/python/paddle/v2/fluid/tests/test_mean_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..76f3c4eb644019b122a93fc12512c3bd39a71bbe --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function +import unittest + +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.optimizer as optimizer +from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.memory_optimization_transpiler import memory_optimize + + +class TestControlFlowGraph(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + opt = optimizer.SGD(learning_rate=0.001) + opt = opt.minimize(avg_cost) + + self.program = program + + def test_control_flow_graph(self): + print("before optimization") + print(str(self.program)) + result_program = memory_optimize(self.program) + print("after optimization") + print(str(result_program)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_minus_op.py b/python/paddle/v2/fluid/tests/test_minus_op.py index c56d7cb548706880dd482bad750f2989c0e9a710..99c0d9056a7172ea40b7db943800a86d4cfbdb46 100644 --- a/python/paddle/v2/fluid/tests/test_minus_op.py +++ b/python/paddle/v2/fluid/tests/test_minus_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
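An aside on the test_lrn_op.py hunk above: `mid` is now built directly as float32, and the grad check is re-enabled by dropping the early exit(0). The reference there fills `mid` with k and accumulates alpha-scaled squared activations over a window of n neighboring channels. A vectorized numpy sketch, assuming the usual cross-map formula out = x * (k + alpha * sum(x^2))^(-beta):

    import numpy as np

    def lrn_naive(x, n=5, k=2.0, alpha=1e-4, beta=0.75):
        # x: (N, C, H, W); window of n channels centered on each channel c
        N, C, H, W = x.shape
        mid = np.empty((N, C, H, W)).astype("float32")
        mid.fill(k)
        start = -(n - 1) // 2
        for c in range(C):
            lo, hi = max(0, c + start), min(C, c + start + n)
            mid[:, c, :, :] += alpha * (x[:, lo:hi, :, :] ** 2).sum(axis=1)
        return x * mid ** (-beta)

    out = lrn_naive(np.random.rand(2, 8, 4, 4).astype("float32"))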
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py index 8af99005dc0b5d50de60ca89c2ddf870b1537edb..18e3991b94843cda7316b8e23be02475bb669653 100644 --- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py +++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py @@ -1,5 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard, default_main_program, default_startup_program from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.optimizer import MomentumOptimizer import paddle.v2.fluid.core as core @@ -10,45 +23,42 @@ import numpy as np class TestMNISTIfElseOp(unittest.TestCase): def test_raw_api(self): - kwargs = {'startup_program': Program(), 'main_program': Program()} - image = layers.data( - name='x', shape=[784], data_type='float32', **kwargs) + prog = Program() + startup_prog = Program() + with program_guard(prog, startup_prog): + image = layers.data(name='x', shape=[784], dtype='float32') - label = layers.data(name='y', shape=[1], data_type='int64', **kwargs) + label = layers.data(name='y', shape=[1], dtype='int64') - limit = layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0, **kwargs) + limit = layers.fill_constant_batch_size_like( + input=label, dtype='int64', shape=[1], value=5.0) + cond = layers.less_than(x=label, y=limit) + true_image, false_image = layers.split_lod_tensor( + input=image, mask=cond) - cond = layers.less_than(x=label, y=limit, **kwargs) - true_image, false_image = layers.split_lod_tensor( - input=image, mask=cond, **kwargs) + true_out = layers.create_tensor(dtype='float32') + true_cond = layers.ConditionalBlock([true_image]) - true_out = layers.create_tensor(dtype='float32', **kwargs) - true_cond = layers.ConditionalBlock([true_image], **kwargs) + with true_cond.block(): + hidden = layers.fc(input=true_image, size=100, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + layers.assign(input=prob, output=true_out) - with true_cond.block(): - hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs) - prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs) - layers.assign(input=prob, output=true_out, **kwargs) + false_out = layers.create_tensor(dtype='float32') + false_cond = layers.ConditionalBlock([false_image]) - false_out = layers.create_tensor(dtype='float32', **kwargs) - false_cond = layers.ConditionalBlock([false_image], **kwargs) + with false_cond.block(): + hidden = layers.fc(input=false_image, size=200, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + layers.assign(input=prob, output=false_out) - with false_cond.block(): - hidden = layers.fc(input=false_image, - size=200, - act='tanh', - **kwargs) - prob = 
layers.fc(input=hidden, size=10, act='softmax', **kwargs) - layers.assign(input=prob, output=false_out, **kwargs) + prob = layers.merge_lod_tensor( + in_true=true_out, in_false=false_out, mask=cond, x=image) + loss = layers.cross_entropy(input=prob, label=label) + avg_loss = layers.mean(x=loss) - prob = layers.merge_lod_tensor( - in_true=true_out, in_false=false_out, mask=cond, x=image, **kwargs) - loss = layers.cross_entropy(input=prob, label=label, **kwargs) - avg_loss = layers.mean(x=loss, **kwargs) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, kwargs['startup_program']) + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer.minimize(avg_loss, startup_prog) train_reader = paddle.batch( paddle.reader.shuffle( @@ -58,7 +68,7 @@ class TestMNISTIfElseOp(unittest.TestCase): place = core.CPUPlace() exe = Executor(place) - exe.run(kwargs['startup_program']) + exe.run(startup_prog) PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): @@ -66,57 +76,46 @@ class TestMNISTIfElseOp(unittest.TestCase): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.expand_dims(y_data, axis=1) - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - - outs = map(np.array, - exe.run(kwargs['main_program'], - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_loss])) + outs = exe.run(prog, + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) print outs[0] if outs[0] < 1.0: return self.assertFalse(True) def test_ifelse(self): - kwargs = {'startup_program': Program(), 'main_program': Program()} - image = layers.data( - name='x', shape=[784], data_type='float32', **kwargs) - - label = layers.data(name='y', shape=[1], data_type='int64', **kwargs) - - limit = layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0, **kwargs) - - cond = layers.less_than(x=label, y=limit, **kwargs) - - ie = layers.IfElse(cond, **kwargs) - - with ie.true_block(): - true_image = ie.input(image) - hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs) - prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs) - ie.output(prob) - - with ie.false_block(): - false_image = ie.input(image) - hidden = layers.fc(input=false_image, - size=200, - act='tanh', - **kwargs) - prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs) - ie.output(prob) - - prob = ie() - loss = layers.cross_entropy(input=prob[0], label=label, **kwargs) - avg_loss = layers.mean(x=loss, **kwargs) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, kwargs['startup_program']) + prog = Program() + startup_prog = Program() + with program_guard(prog, startup_prog): + image = layers.data(name='x', shape=[784], dtype='float32') + + label = layers.data(name='y', shape=[1], dtype='int64') + + limit = layers.fill_constant_batch_size_like( + input=label, dtype='int64', shape=[1], value=5.0) + cond = layers.less_than(x=label, y=limit) + ie = layers.IfElse(cond) + + with ie.true_block(): + true_image = ie.input(image) + hidden = layers.fc(input=true_image, size=100, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + + with ie.false_block(): + false_image = ie.input(image) + hidden = layers.fc(input=false_image, size=200, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + + prob = ie() + loss = 
layers.cross_entropy(input=prob[0], label=label)
+            avg_loss = layers.mean(x=loss)
+
+            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+            optimizer.minimize(avg_loss, startup_prog)

         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=8192),
@@ -131,19 +130,12 @@ class TestMNISTIfElseOp(unittest.TestCase):
             for data in train_reader():
                 x_data = np.array(map(lambda x: x[0], data)).astype("float32")
                 y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                y_data = np.expand_dims(y_data, axis=1)
-
-                tensor_x = core.LoDTensor()
-                tensor_x.set(x_data, place)
-
-                tensor_y = core.LoDTensor()
-                tensor_y.set(y_data, place)
+                y_data = y_data.reshape((y_data.shape[0], 1))

-                outs = map(np.array,
-                           exe.run(kwargs['main_program'],
-                                   feed={'x': tensor_x,
-                                         'y': tensor_y},
-                                   fetch_list=[avg_loss]))
+                outs = exe.run(prog,
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
                 print outs[0]
                 if outs[0] < 1.0:
                     return
@@ -151,4 +143,5 @@ class TestMNISTIfElseOp(unittest.TestCase):

 if __name__ == '__main__':
-    unittest.main()
+    # Temporarily disable the IfElse unittest since it could be buggy.
+    exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
index 33de8ff7219fafa1ddeb9ebd78d77ae4fa240c98..40955283e6a9bfe07613f1a49e1adbc53275436e 100644
--- a/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/v2/fluid/tests/test_momentum_op.py b/python/paddle/v2/fluid/tests/test_momentum_op.py
index 638095f7564c8761151a7794f98f9ca797b0083b..8008a5586f13f97b5671c13bafe8bd8d98e5b3ea 100644
--- a/python/paddle/v2/fluid/tests/test_momentum_op.py
+++ b/python/paddle/v2/fluid/tests/test_momentum_op.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
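The test_mnist_if_else_op.py refactor above drops the main_program/startup_program **kwargs threading in favor of a single program_guard scope; layers constructed inside the with-block land in the guarded program automatically. A condensed sketch of the pattern (toy sizes, names as in the diff):

    import paddle.v2.fluid as fluid

    prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(prog, startup_prog):
        # no **kwargs needed: the guard routes every op into prog
        x = fluid.layers.data(name='x', shape=[784], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='int64')
        hidden = fluid.layers.fc(input=x, size=100, act='tanh')
        prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=prob, label=y)
        avg_loss = fluid.layers.mean(x=loss)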
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_mul_op.py b/python/paddle/v2/fluid/tests/test_mul_op.py index 57d6d7e7e095cab2c3afb60d229fc09da98aed8b..3033b8ef70d2856805687fa7f09bf51fed01ff5a 100644 --- a/python/paddle/v2/fluid/tests/test_mul_op.py +++ b/python/paddle/v2/fluid/tests/test_mul_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_multiplex_op.py b/python/paddle/v2/fluid/tests/test_multiplex_op.py index 5937eb5aa4621556c9b8d59ea83a39d9738c7925..5746ab391e8b4201fc05b7556ad22bbb16fcd8ab 100644 --- a/python/paddle/v2/fluid/tests/test_multiplex_op.py +++ b/python/paddle/v2/fluid/tests/test_multiplex_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_nccl_init_op.py b/python/paddle/v2/fluid/tests/test_nccl_init_op.py deleted file mode 100644 index a536800ccd81fdc2f3b7c8320cede4f8ecf3a8cb..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_nccl_init_op.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest, os -import numpy as np -import paddle.v2 as paddle -from paddle.v2.fluid.op import Operator -import paddle.v2.fluid.core as core -from op_test import OpTest, create_op, set_input - -if not core.is_compile_gpu(): - exit(0) - -gpu_count = core.get_cuda_device_count() - -if gpu_count <= 1: - exit(0) - -g_scope = core.Scope() -g_ctx = core.DeviceContext.create(core.CPUPlace()) - - -class TestNCCLInit(unittest.TestCase): - def test_init(self): - self.op_type = "ncclInit" - self.gpus = range(gpu_count) - - self.inputs = {} - self.attrs = {"gpus": self.gpus} - g_scope.var("Communicator").get_communicator() - self.outputs = {"Communicator": g_scope.find_var("Communicator")} - nccl_init = create_op( - g_scope, - op_type=self.op_type, - inputs=self.inputs, - outputs=self.outputs, - attrs=self.attrs) - nccl_init.run(g_scope, g_ctx) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py new file mode 100644 index 0000000000000000000000000000000000000000..ce66a7c6b355d9b97b7bee853025e5e9cb6e533b --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_nce.py @@ -0,0 +1,111 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
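The reference nce() that follows scores every (example, class) pair with a sigmoid and folds in the uniform-sampling prior b = num_neg_samples / num_classes: a pair carrying a true label costs -log(o / (o + b)), a sampled negative costs -log(b / (o + b)). A tiny numeric sketch of just that cost term:

    import numpy as np

    num_classes, num_neg_samples = 10, 3
    b = 1.0 / num_classes * num_neg_samples  # same prior as in the test
    o = 1.0 / (1.0 + np.exp(-0.7))           # sigmoid score of one pair
    pos_cost = -np.log(o / (o + b))          # pair holds a true label
    neg_cost = -np.log(b / (o + b))          # pair holds a sampled negative
    print(pos_cost, neg_cost)                # ~0.37 vs ~1.17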
+import unittest +import numpy as np +from op_test import OpTest + + +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + +class TestNCE(OpTest): + def generate_data(self, dim, batch_size, num_classes, num_true_class, + num_neg_samples): + input = np.random.randn(batch_size, dim).astype(np.float32) + weight = np.random.randn(num_classes, dim).astype(np.float32) + bias = np.random.randn(num_classes).astype(np.float32) + sample_weight = np.random.randn(batch_size).astype(np.float32) + labels = np.random.randint(0, num_classes, (batch_size, num_true_class)) + self.attrs = { + 'num_total_classes': num_classes, + 'num_neg_samples': num_neg_samples, + 'custom_neg_classes': range(num_neg_samples) + } + self.inputs = { + 'Input': input, + 'Label': labels, + 'Weight': weight, + 'Bias': bias, + 'SampleWeight': sample_weight + } + + def set_data(self): + self.generate_data(5, 5, 4, 1, 2) + + def compute(self): + out = nce(self.inputs['Input'], self.inputs['Weight'], + self.inputs['Bias'], self.inputs['SampleWeight'], + self.inputs['Label'], self.attrs['num_total_classes'], + self.attrs['num_neg_samples']) + self.outputs = { + 'Cost': out[0], + 'SampleLogits': out[1], + 'SampleLabels': out[2] + } + + def setUp(self): + self.op_type = 'nce' + self.set_data() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02) + + +class TestNCECase1(TestNCE): + def set_data(self): + self.generate_data(10, 20, 10, 2, 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_net.py b/python/paddle/v2/fluid/tests/test_net.py index 318df08a9e73ac95cab73c34182bc6220ef6c681..cc78cb4a56de8256f6c5cb41584d875f0946cd12 100644 --- a/python/paddle/v2/fluid/tests/test_net.py +++ b/python/paddle/v2/fluid/tests/test_net.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator import unittest @@ -7,7 +20,7 @@ def fc(X, W, Y): ret_v = core.Net.create() ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation")) - ret_v.append_op(Operator("sigmoid", X="pre_activation", Y=Y)) + ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y)) ret_v.complete_add_op(True) return ret_v @@ -30,7 +43,7 @@ Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]} Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}. - Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Y[fc.out]}. + Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}. ''' self.assertEqual(expected, "\n" + str(net)) diff --git a/python/paddle/v2/fluid/tests/test_norm_op.py b/python/paddle/v2/fluid/tests/test_norm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b053522d72bfb95b1d3da960482857f0bf6f6f8d --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_norm_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
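The looped reference norm() that follows L2-normalizes each spatial position across the channel axis and then rescales by Scale. An equivalent vectorized form, as a sketch under the same (N, C, H, W) layout assumption:

    import numpy as np

    def norm_vectorized(x, scale, epsilon):
        # x: (N, C, H, W), scale: (C,); normalize across channels
        denom = np.sqrt((x * x).sum(axis=1, keepdims=True) + epsilon)
        return x / denom * scale.reshape(1, -1, 1, 1)

    x = np.random.random((2, 3, 2, 2)).astype('float32')
    out = norm_vectorized(x, np.array([10.0, 10.0, 10.0]), 1e-6)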
+import unittest +import numpy as np +from op_test import OpTest + + +def norm(input, scale, epsilon): + s0, s1, s2, s3 = input.shape + x_square = input * input + for i in xrange(s0): + input_batch = input[i:i + 1, :, :, :] + input_batch = input_batch.reshape(s1, s2 * s3) + x_square_batch = x_square[i:i + 1, :, :, :] + x_square_batch = x_square_batch.reshape(s1, s2 * s3) + square_colsum = x_square_batch.sum(axis=0) + epsilon + tmp = pow(square_colsum, 0.5) + tmp = np.reciprocal(tmp) + tmp_tile = np.tile(tmp, s1) + tmp_tile = tmp_tile.reshape(s1, s2 * s3) + scale_tile = np.tile(scale, (1, s2 * s3)) + scale_tile = scale_tile.reshape(s1, s2 * s3) + out_batch = input_batch * tmp_tile * scale_tile + out_batch = out_batch.reshape(1, s1, s2, s3) + if i == 0: + out = out_batch + else: + out = np.concatenate((out, out_batch), 0) + out.reshape(s0, s1, s2, s3) + return out + + +class TestNormOp(OpTest): + def setUp(self): + self.op_type = "norm" + self.init_test_case() + input = np.random.random(self.shape).astype("float32") + scale = np.array([10, 10, 10]) + self.inputs = { + 'X': input.astype('float32'), + 'Scale': scale.astype('float32') + } + self.attrs = {'epsilon': self.epsilon} + output = norm(input, scale, self.epsilon) + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.shape = [2, 3, 2, 2] + self.epsilon = 1e-6 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_op_support_gpu.py b/python/paddle/v2/fluid/tests/test_op_support_gpu.py index a0eb4bd5fd2cc178ffe0763efdee61524ad6d4bd..741686a87465853b26311944e97cd88738d070fd 100644 --- a/python/paddle/v2/fluid/tests/test_op_support_gpu.py +++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.core as core diff --git a/python/paddle/v2/fluid/tests/test_operator.py b/python/paddle/v2/fluid/tests/test_operator.py index 4aa022ef90159cd96eed4e4dbe30cf5d1e8a41a7..e75ee41149c6cf4479cd62620647198bc738406d 100644 --- a/python/paddle/v2/fluid/tests/test_operator.py +++ b/python/paddle/v2/fluid/tests/test_operator.py @@ -1,6 +1,19 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest + import paddle.v2.fluid.op as op -import paddle.v2.fluid.core as core import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py index e8362d2e9c6038c04c24dce35de8c53bfde78142..ed18fafe339271ce61b891966162bfc0d4a8c48d 100644 --- a/python/paddle/v2/fluid/tests/test_operator_desc.py +++ b/python/paddle/v2/fluid/tests/test_operator_desc.py @@ -1,11 +1,28 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest -from paddle.v2.fluid.framework import Variable, Program, g_main_program + import paddle.v2.fluid.core as core +from paddle.v2.fluid.framework import Program, default_startup_program + +main_program = default_startup_program() + class TestOperator(unittest.TestCase): def test_error_type(self): - block = g_main_program.create_block() + block = main_program.create_block() try: block.append_op() self.assertFail() diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 2459dfd664300d405edb36c4ca906c1769b5e7d2..dbec3a59441dd1aa2e87296d9e4edd7e1a0f1306 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -1,8 +1,21 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
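Two API shifts run through the optimizer tests below: append_backward_ops is renamed append_backward, and Optimizer.minimize() now returns a pair (optimize ops, params_grads) instead of the ops alone, hence the `opts, _ = ...` unpacking. A minimal sketch of the updated calling convention, reusing the toy regression from the memory-optimization test above:

    import paddle.v2.fluid as fluid
    import paddle.v2.fluid.optimizer as optimizer

    prog, init_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(prog, init_prog):
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        y_predict = fluid.layers.fc(input=x, size=1, act=None)
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(x=cost)
        sgd = optimizer.SGDOptimizer(learning_rate=0.01)
        opts, params_grads = sgd.minimize(avg_cost, init_prog)  # now a pair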
import unittest import paddle.v2.fluid.framework as framework import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestOptimizer(unittest.TestCase): @@ -27,7 +40,7 @@ class TestOptimizer(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") @@ -57,7 +70,7 @@ class TestOptimizer(unittest.TestCase): learning_rate = 0.01 sgd_optimizer = optimizer.SGDOptimizer( learning_rate=learning_rate, global_step=global_step) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 2) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") @@ -102,7 +115,7 @@ class TestMomentumOptimizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( @@ -151,7 +164,7 @@ class TestMomentumOptimizer(unittest.TestCase): learning_rate = 0.01 momentum_optimizer = self.MockMomentum( learning_rate=learning_rate, momentum=0.2, use_nesterov=True) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( @@ -209,7 +222,7 @@ class TestAdagradOptimizer(unittest.TestCase): learning_rate = 0.01 adagrad_optimizer = self.MockAdagrad( learning_rate=learning_rate, epsilon=1.0e-6) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, @@ -269,7 +282,7 @@ class TestAdamOptimizer(unittest.TestCase): learning_rate = 0.01 adam_optimizer = self.MockAdam( learning_rate=learning_rate, beta1=0.9, beta2=0.999) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, @@ -331,7 +344,7 @@ class TestAdamaxOptimizer(unittest.TestCase): learning_rate = 0.01 adamax_optimizer = self.MockAdamax( learning_rate=learning_rate, beta1=0.9, beta2=0.999) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, @@ -390,7 +403,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): learning_rate = 0.01 decayed_adagrad_optimizer = self.MockDecayedAdagrad( learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) 
self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) opts = decayed_adagrad_optimizer.create_optimization_pass( diff --git a/python/paddle/v2/fluid/tests/test_pad_op.py b/python/paddle/v2/fluid/tests/test_pad_op.py index 55f1774e5755c846f60a2f1df3e705444a81192b..1036b6bcad307a6ce6b4092c84db7f494e8f5811 100644 --- a/python/paddle/v2/fluid/tests/test_pad_op.py +++ b/python/paddle/v2/fluid/tests/test_pad_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3c190477d167c6aa48078869a9abb15488d21dd3 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -0,0 +1,174 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest + +import paddle.v2.fluid as fluid +import numpy +import sys +# TODO(dzhwinter): get places op check need to be enhanced. +sys.exit(0) + + +class BaseParallelForTest(unittest.TestCase): + def run_test(self, callback, feed, fetch): + """ + Run the unittest for parallel.for + Args: + callback(callable): A callable function returns a generator. There + are two yields in the generator function. The first yield + returns the data layers, and the second yield returns the loss. + The modified data variables will be sent back during the first + yield. + + feed(dict): The executor feeding dictionary. + fetch(list|basestr): The fetch name lists. + + Returns: + None + + Raises: + AssertionError when the computation of cpu, parallel.for in cpu, + gpu, parallel.for in gpu are different. 
+ + """ + cpu = fluid.CPUPlace() + result_cpu = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=cpu, + use_parallel=False) + result_cpu_parallel = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=cpu, + use_parallel=True) + if fluid.core.is_compile_gpu(): + gpu = fluid.CUDAPlace(0) + result_gpu = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=gpu, + use_parallel=False) + result_gpu_parallel = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=gpu, + use_parallel=True) + self._assert_same_(fetch, result_cpu, result_cpu_parallel, + result_gpu, result_gpu_parallel) + else: + self._assert_same_(fetch, result_cpu, result_cpu_parallel) + + def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False): + """ + Run a single test, returns the fetch values + Args: + place(Place): the computation place. + use_parallel(bool): Whether use parallel.for or not. + + Returns: + Fetched numpy arrays. + + """ + if isinstance(fetch, basestring): + fetch = [fetch] + main = fluid.Program() + startup = fluid.Program() + # Fix seed + main.random_seed = 10 + startup.random_seed = 10 + + with fluid.program_guard(main, startup): + generator = callback() + # Automatically insert parallel do if use_parallel = True + if use_parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + data = next(generator) + + if isinstance(data, fluid.Variable): + data = [data] + + with pd.do(): + ins = map(pd.read_input, data) + if len(ins) == 1: + ins = ins[0] + loss = generator.send(ins) # patch input + pd.write_output(loss) + + loss = pd() + else: + data = next(generator) + loss = generator.send(data) + self.assertIsNotNone(loss) + avg_loss = fluid.layers.mean(x=loss) + fluid.backward.append_backward(loss=avg_loss) + + exe = fluid.Executor(place) + exe.run(startup) + return exe.run(main, feed=feed, fetch_list=fetch) + + def _assert_same_(self, fetch, *args): + """ + Assert the return values of `run_test` are same. + Args: + fetch: Fetch list. Used for print error message + *args: The fetch result lists of each situations. 
+ + Returns: + None + + Raises: + AssertionError + + """ + + def _impl_(a, b, fetch_id, item_id): + item_str = ['CPU', 'ParallelCPU', 'GPU', 'ParallelGPU'] + flag = numpy.allclose(a, b, rtol=0.1) + self.assertTrue(flag, "The {0} are different in {1}".format( + fetch[fetch_id], item_str[item_id])) + + for i, items in enumerate(zip(*args)): + self.assertGreater(len(items), 0) + for j in range(1, len(items)): + _impl_(items[0], items[j], fetch_id=i, item_id=j) + + +class ParallelOpTest(BaseParallelForTest): + def test_simple_fc(self): + def __network__(): + x = fluid.layers.data(shape=[784], dtype='float32', name='img') + # FIXME: This is a bug of parallel.do + x.stop_gradient = False + x = yield x + hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') + loss = fluid.layers.mean(x=hidden) + yield loss + + self.run_test( + callback=__network__, + feed={ + 'img': + numpy.random.random(size=(128 * 3, 784)).astype('float32') + }, + fetch='fc1.w@GRAD') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py index a633d22c2b1db2728b6eb767078ce4aec6cce163..e0db318345061ad30178b69313b30f41c0c62164 100644 --- a/python/paddle/v2/fluid/tests/test_parameter.py +++ b/python/paddle/v2/fluid/tests/test_parameter.py @@ -1,17 +1,32 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
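The harness above swaps the plain network for a ParallelDo block when use_parallel is set. Pulled out of _run_test_impl_, the core pattern looks roughly like this (a sketch; layer sizes are arbitrary):

    import paddle.v2.fluid as fluid

    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        img = fluid.layers.data(shape=[784], dtype='float32', name='img')
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            x = pd.read_input(img)        # shard the batch across places
            hidden = fluid.layers.fc(input=x, size=200)
            pd.write_output(hidden)
        hidden = pd()                     # gathered from every place
        avg_loss = fluid.layers.mean(x=hidden)
        fluid.backward.append_backward(loss=avg_loss)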
import unittest -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.io as io from paddle.v2.fluid.initializer import ConstantInitializer import numpy as np +main_program = default_main_program() + class TestParameter(unittest.TestCase): def test_param(self): shape = [784, 100] val = 1.0625 - b = g_main_program.global_block() + b = main_program.global_block() param = b.create_parameter( name='fc.w', shape=shape, @@ -20,12 +35,12 @@ class TestParameter(unittest.TestCase): self.assertIsNotNone(param) self.assertEqual('fc.w', param.name) self.assertEqual((784, 100), param.shape) - self.assertEqual(core.DataType.FP32, param.data_type) + self.assertEqual(core.DataType.FP32, param.dtype) self.assertEqual(0, param.block.idx) exe = Executor(core.CPUPlace()) - p = exe.run(g_main_program, fetch_list=[param])[0] - self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) - p = io.get_parameter_value_by_name('fc.w', exe, g_main_program) + p = exe.run(main_program, fetch_list=[param])[0] + self.assertTrue(np.allclose(p, np.ones(shape) * val)) + p = io.get_parameter_value_by_name('fc.w', exe, main_program) self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) diff --git a/python/paddle/v2/fluid/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py index 5dff6270f455395ce6ca8ae2428236f630467095..ac8b24e7ad57b536ba9db360a5499c36b783b133 100644 --- a/python/paddle/v2/fluid/tests/test_pool2d_op.py +++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py @@ -1,5 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
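+#
+# The cases below exercise both execution paths of the pool2d operator: the
+# plain CPU/GPU kernels and, when the use_cudnn attribute is set, the cuDNN
+# kernels, whose outputs and gradients are checked on CUDAPlace(0) against
+# the naive NumPy reference implementations.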
 import unittest
 import numpy as np
+
+import paddle.v2.fluid.core as core
 from op_test import OpTest
 
 
@@ -44,6 +59,7 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
 
 class TestPool2d_Op(OpTest):
     def setUp(self):
+        self.use_cudnn = False
         self.init_test_case()
         self.init_global_pool()
         self.init_op_type()
@@ -62,15 +78,25 @@ class TestPool2d_Op(OpTest):
             'ksize': self.ksize,
             'pooling_type': self.pool_type,
             'global_pooling': self.global_pool,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
         }
         self.outputs = {'Out': output.astype('float32')}
 
     def test_check_output(self):
-        self.check_output()
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
 
     def test_check_grad(self):
-        if self.pool_type != "max":
+        if self.use_cudnn and self.pool_type != "max":
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, set(['X']), 'Out', max_relative_error=0.07)
+        elif self.pool_type != "max":
             self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
 
     def init_test_case(self):
@@ -153,35 +179,41 @@ class TestCase5(TestCase2):
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
-#--------------------test pool2d_cudnn--------------------
-class TestCudnnCase1(TestPool2d_Op):
+#--------------------test pool2d--------------------
+class TestCUDNNCase1(TestPool2d_Op):
     def init_op_type(self):
-        self.op_type = "pool2d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool2d"
 
 
-class TestCudnnCase2(TestCase1):
+class TestCUDNNCase2(TestCase1):
     def init_op_type(self):
-        self.op_type = "pool2d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool2d"
 
 
-class TestCudnnCase3(TestCase2):
+class TestCUDNNCase3(TestCase2):
     def init_op_type(self):
-        self.op_type = "pool2d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool2d"
 
 
-class TestCudnnCase4(TestCase3):
+class TestCUDNNCase4(TestCase3):
     def init_op_type(self):
-        self.op_type = "pool2d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool2d"
 
 
-class TestCudnnCase5(TestCase4):
+class TestCUDNNCase5(TestCase4):
     def init_op_type(self):
-        self.op_type = "pool2d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool2d"
 
 
-class TestCudnnCase6(TestCase5):
+class TestCUDNNCase6(TestCase5):
     def init_op_type(self):
-        self.op_type = "pool2d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool2d"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py
index 2ba86665a7d207e61159c02643fa40daca3be080..54b8df8465b44df953555bb1e4d299a48ffb5d66 100644
--- a/python/paddle/v2/fluid/tests/test_pool3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py
@@ -1,5 +1,20 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
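+#
+# These 3D tests mirror the 2D pattern above with one extra depth dimension.
+# For each spatial dimension the reference pooling follows the usual output
+# size arithmetic (a sketch, ignoring the global_pool case):
+#
+#   out_dim = (in_dim - ksize + 2 * padding) // stride + 1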
 import unittest
 import numpy as np
+
+import paddle.v2.fluid.core as core
 from op_test import OpTest
 
 
@@ -52,6 +67,7 @@ def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
 
 class TestPool3d_Op(OpTest):
     def setUp(self):
+        self.use_cudnn = False
         self.init_test_case()
         self.init_global_pool()
         self.init_op_type()
@@ -71,15 +87,25 @@ class TestPool3d_Op(OpTest):
             'ksize': self.ksize,
             'pooling_type': self.pool_type,
             'global_pooling': self.global_pool,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
         }
         self.outputs = {'Out': output.astype('float32')}
 
     def test_check_output(self):
-        self.check_output()
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
 
     def test_check_grad(self):
-        if self.pool_type != "max":
+        if self.use_cudnn and self.pool_type != "max":
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, set(['X']), 'Out', max_relative_error=0.07)
+        elif self.pool_type != "max":
             self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
 
     def init_test_case(self):
@@ -163,35 +189,41 @@ class TestCase5(TestCase2):
         self.pool3D_forward_naive = max_pool3D_forward_naive
 
 
-#--------------------test pool3d_cudnn--------------------
-class TestCudnnCase1(TestPool3d_Op):
+#--------------------test pool3d--------------------
+class TestCUDNNCase1(TestPool3d_Op):
     def init_op_type(self):
-        self.op_type = "pool3d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool3d"
 
 
-class TestCudnnCase2(TestCase1):
+class TestCUDNNCase2(TestCase1):
     def init_op_type(self):
-        self.op_type = "pool3d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool3d"
 
 
-class TestCudnnCase3(TestCase2):
+class TestCUDNNCase3(TestCase2):
     def init_op_type(self):
-        self.op_type = "pool3d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool3d"
 
 
-class TestCudnnCase4(TestCase3):
+class TestCUDNNCase4(TestCase3):
     def init_op_type(self):
-        self.op_type = "pool3d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool3d"
 
 
-class TestCudnnCase5(TestCase4):
+class TestCUDNNCase5(TestCase4):
     def init_op_type(self):
-        self.op_type = "pool3d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool3d"
 
 
-class TestCudnnCase6(TestCase5):
+class TestCUDNNCase6(TestCase5):
     def init_op_type(self):
-        self.op_type = "pool3d_cudnn"
+        self.use_cudnn = True
+        self.op_type = "pool3d"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_pool_max_op.py b/python/paddle/v2/fluid/tests/test_pool_max_op.py
index 9d2d61c43868701392e90542f3b7fb2c4ea07548..c4ec0e50cc9d11d72faccd1a07b3f1843a9bcec3 100644
--- a/python/paddle/v2/fluid/tests/test_pool_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py index f6a6c428a26dece01fe2958991edd3edf3a8266e..b75f7152efb0fbd56c87ff5b49d1a353aee5af25 100644 --- a/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py +++ b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import itertools import numpy as np diff --git a/python/paddle/v2/fluid/tests/test_precision_recall_op.py b/python/paddle/v2/fluid/tests/test_precision_recall_op.py index d3dbdb6e2aba6dfe98440ad07083cf1ffda5b668..87c7fcb4b5f06e530959f53b8d247cced0196350 100644 --- a/python/paddle/v2/fluid/tests/test_precision_recall_op.py +++ b/python/paddle/v2/fluid/tests/test_precision_recall_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_prelu_op.py b/python/paddle/v2/fluid/tests/test_prelu_op.py index 7be932ac8f6b82283fecd32ac4b3b7bb9aff0338..38bd260bc92e2776b408a8c8b16ac905bb1de537 100644 --- a/python/paddle/v2/fluid/tests/test_prelu_op.py +++ b/python/paddle/v2/fluid/tests/test_prelu_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_print_op.py b/python/paddle/v2/fluid/tests/test_print_op.py new file mode 100644 index 0000000000000000000000000000000000000000..4e42863af45353b29d54daf76c9ab3608b217298 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_print_op.py @@ -0,0 +1,68 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import paddle.v2.fluid.core as core +from paddle.v2.fluid.executor import Executor +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.backward import append_backward +from paddle.v2.fluid.framework import switch_main_program +from paddle.v2.fluid.framework import Program +import numpy as np + + +class TestPrintOpCPU(unittest.TestCase): + def setUp(self): + self.place = core.CPUPlace() + self.x_tensor = core.LoDTensor() + tensor_np = np.random.random(size=(2, 3)).astype('float32') + self.x_tensor.set(tensor_np, self.place) + self.x_tensor.set_lod([[0, 1, 1]]) + + def build_network(self, only_forward, **kargs): + x = layers.data('x', shape=[3], dtype='float32', lod_level=1) + x.stop_gradient = False + printed = layers.Print(input=x, **kargs) + if only_forward: return printed + loss = layers.mean(x=printed) + append_backward(loss=loss) + return loss + + def test_forward(self): + switch_main_program(Program()) + printed = self.build_network(True, print_phase='forward') + exe = Executor(self.place) + outs = exe.run(feed={'x': self.x_tensor}, + fetch_list=[printed], + return_numpy=False) + + def test_backward(self): + switch_main_program(Program()) + loss = self.build_network(False, print_phase='backward') + exe = Executor(self.place) + outs = exe.run(feed={'x': self.x_tensor}, + fetch_list=[loss], + return_numpy=False) + + +class TestPrintOpGPU(TestPrintOpCPU): + def setUp(self): + self.place = core.CUDAPlace(0) + self.x_tensor = core.LoDTensor() + tensor_np = np.random.random(size=(2, 3)).astype('float32') + self.x_tensor.set(tensor_np, self.place) + self.x_tensor.set_lod([[0, 1, 1]]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..4b439a16aa24fa41870b3e156247df4dbf5b5098 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
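+#
+# profiler.cuda_profiler is used below as a context manager: profiling is
+# switched on when the block is entered and stopped (with results written to
+# the output file) when it exits. Minimal usage sketch:
+#
+#   with profiler.cuda_profiler('cuda_profiler.txt', 'csv'):
+#       exe.run(fluid.default_main_program(), feed={'data': input})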
+import unittest +import numpy as np +import paddle.v2.fluid as fluid +import paddle.v2.fluid.profiler as profiler +import paddle.v2.fluid.layers as layers +import os + + +class TestProfiler(unittest.TestCase): + def test_nvprof(self): + if not fluid.core.is_compile_gpu(): + return + epoc = 8 + dshape = [4, 3, 28, 28] + data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + os.remove(output_file) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py index e9bcefd21569aaa9225c676ea03b5c8e37d00333..bcaeede93e43235d4c0288715dc19c7205e6a7c0 100644 --- a/python/paddle/v2/fluid/tests/test_program.py +++ b/python/paddle/v2/fluid/tests/test_program.py @@ -1,35 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +from __future__ import print_function import unittest -from paddle.v2.fluid.framework import Program -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import Program, default_main_program, program_guard, grad_var_name +import paddle.v2.fluid.layers as layers + +main_program = default_main_program() class TestProgram(unittest.TestCase): def test_program(self): - b = g_main_program.current_block() + b = main_program.current_block() self.assertEqual(-1, b.parent_idx) self.assertEqual(0, b.idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(2, b.idx) self.assertEqual(1, b.parent_idx) - g_main_program.rollback() + main_program.rollback() - b = g_main_program.current_block() + b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(3, b.idx) self.assertEqual(1, b.parent_idx) - g_main_program.rollback() - b = g_main_program.current_block() + main_program.rollback() + b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) @@ -48,8 +64,8 @@ class TestProgram(unittest.TestCase): # FIXME(yuyang18): We manual compare the output string, since the order # of variable could be changed. 
- print prog - print prog.clone() + print(prog) + print(prog.clone()) def test_parse_program_from_string(self): prog = Program() @@ -67,8 +83,8 @@ class TestProgram(unittest.TestCase): binary_str = prog.desc.serialize_to_string() prog_restored = Program.parse_from_string(binary_str) - print prog - print prog_restored + print(prog) + print(prog_restored) def test_append_backward(self): prog = Program() @@ -106,12 +122,10 @@ class TestProgram(unittest.TestCase): self.assertEqual(add_op.idx, 1) param_to_grad = prog.append_backward(mean_out, set()) - def grad_name(name): - return name + "@GRAD" - for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out", "mean.out"): - self.assertEqual(param_to_grad[var_name][0], grad_name(var_name)) + self.assertEqual(param_to_grad[var_name][0], + grad_var_name(var_name)) self.assertEqual(param_to_grad[var_name][1], 0) expect_ops = [ @@ -123,6 +137,17 @@ class TestProgram(unittest.TestCase): actual_ops.append(op.type) self.assertEqual(actual_ops, expect_ops) + def test_program_clone_with_parameter(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program, startup_program): + d = layers.data(name='x', shape=[784], dtype='float32') + hidden = layers.fc(input=d, size=100) + layers.fc(input=hidden, size=100) + + new_program = main_program.clone() + self.assertNotEqual(0, len(new_program.blocks[0].all_parameters())) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_protobuf.py b/python/paddle/v2/fluid/tests/test_protobuf.py index e064374176fa221cfd042b7dbd2ddcb3b5ec41ec..5f0646d03603dfe3e37ac23e398812517db9ac17 100644 --- a/python/paddle/v2/fluid/tests/test_protobuf.py +++ b/python/paddle/v2/fluid/tests/test_protobuf.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 import unittest diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py index 098a9802dfc6763ce2a2356b7267a439145b7939..24638dc0e8a74b60fa0a3ed1852093728c0bc79c 100644 --- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py +++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import paddle.v2.fluid.core as core @@ -101,13 +114,13 @@ class TestVarDesc(unittest.TestCase): self.assertEqual(src_shape, res_shape) self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type()) - def test_data_type(self): + def test_dtype(self): program_desc = core.ProgramDesc() block = program_desc.block(0) var = block.var('my_var') var.set_type(core.VarDesc.VarType.LOD_TENSOR) - var.set_data_type(core.DataType.INT32) - self.assertEqual(core.DataType.INT32, var.data_type()) + var.set_dtype(core.DataType.INT32) + self.assertEqual(core.DataType.INT32, var.dtype()) self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type()) diff --git a/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py index f89a493ab7a7a3d841088b7db37bff4dfbe63735..c197d850f978a2e0de67e0252d79c429c66c1a77 100644 --- a/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_proximal_gd_op.py b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py index 9ca79ce6b3b710244e4f65db70b305231a9f3fcf..15452558252d09e8a81ab647a64340c30029b505 100644 --- a/python/paddle/v2/fluid/tests/test_proximal_gd_op.py +++ b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_rank_loss_op.py index 0e41ab1b3fd8fa8b62c5f3b914b752918119a265..b4ba7920cd7aaf610075d3dee37f0b7825b387bd 100644 --- a/python/paddle/v2/fluid/tests/test_rank_loss_op.py +++ b/python/paddle/v2/fluid/tests/test_rank_loss_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py index b623d1231838faff9e91c9234befb1f647fe8ec2..bcc3457aa3ae6dce161d0d58d6fd965b0cb87f11 100644 --- a/python/paddle/v2/fluid/tests/test_recurrent_op.py +++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py @@ -1,9 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, grad_var_name from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy as np import paddle.v2.fluid.core as core @@ -118,14 +131,14 @@ class RecurrentOpTest1(unittest.TestCase): def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], - data_type='float32', + dtype='float32', name='x', append_batch_size=False, **self.p_info) x.stop_gradient = False h_boot = layers.data( shape=[self.input_dim], - data_type='float32', + dtype='float32', name='h_boot', **self.p_info) h_boot.stop_gradient = False @@ -156,7 +169,7 @@ class RecurrentOpTest1(unittest.TestCase): feed=self.feed_map, fetch_list=[self.output]) - return np.array(out[0]) + return out[0] def backward(self): self.feed_map = { @@ -164,19 +177,20 @@ class RecurrentOpTest1(unittest.TestCase): for x in self.data_field } fetch_list = [ - self.main_program.global_block().var(x + "@GRAD") + self.main_program.global_block().var(grad_var_name(x)) for x in self.data_field ] exe = Executor(self.place) return exe.run(self.main_program, feed=self.feed_map, - fetch_list=fetch_list) + fetch_list=fetch_list, + return_numpy=False) def test_backward(self): self.check_forward() - append_backward_ops(self.output) + append_backward(self.output) ana_grad = [np.array(x) for x in self.backward()] @@ -251,14 +265,14 @@ class RecurrentOpTest2(RecurrentOpTest1): def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], - data_type='float32', + dtype='float32', name='x', append_batch_size=False, **self.p_info) x.stop_gradient = False h_boot = layers.data( shape=[self.input_dim], - data_type='float32', + dtype='float32', name='h_boot', **self.p_info) h_boot.stop_gradient = False @@ -270,12 +284,12 @@ class RecurrentOpTest2(RecurrentOpTest1): temp_l = layers.fc(input=x_t, size=self.input_dim, - param_attr={'name': 'W'}, + 
param_attr='W', bias_attr=False, **self.p_info) temp_r = layers.fc(input=h_pre, size=self.input_dim, - param_attr={'name': 'U'}, + param_attr='U', bias_attr=False, **self.p_info) @@ -350,21 +364,21 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1): def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], - data_type='float32', + dtype='float32', name='x', append_batch_size=False, **self.p_info) x.stop_gradient = False h_boot1 = layers.data( shape=[self.batch_size, self.input_dim], - data_type='float32', + dtype='float32', name='h_boot1', append_batch_size=False, **self.p_info) h_boot1.stop_gradient = False h_boot2 = layers.data( shape=[self.batch_size, self.input_dim], - data_type='float32', + dtype='float32', name='h_boot2', append_batch_size=False, **self.p_info) @@ -435,7 +449,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], - data_type='float32', + dtype='float32', name='x', append_batch_size=False, **self.p_info) @@ -453,4 +467,6 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): if __name__ == '__main__': + # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 + exit(0) unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py index 70359d60cbe656150877673c63e81eae92d8ab9a..57ee307ba66b47bd15864e7be3943b4f5237eb1e 100644 --- a/python/paddle/v2/fluid/tests/test_reduce_op.py +++ b/python/paddle/v2/fluid/tests/test_reduce_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest @@ -85,5 +98,19 @@ class Test1DReduce(OpTest): self.check_grad(['X'], 'Out') +class TestReduceAll(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].sum()} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_registry.py b/python/paddle/v2/fluid/tests/test_registry.py new file mode 100644 index 0000000000000000000000000000000000000000..dba11896307ae52877628ab7a51854b80c7a6fec --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_registry.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import warnings
+
+import numpy as np
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.registry as registry
+
+
+class TestRegistry(unittest.TestCase):
+    def test_registry_layer(self):
+        self.layer_type = "mean"
+        program = framework.Program()
+
+        # build the network inside `program`; otherwise the layers would be
+        # added to the default main program and `exe.run(program, ...)`
+        # would have nothing to execute
+        with framework.program_guard(program):
+            x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
+            output = layers.mean(x)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        X = np.random.random((10, 10)).astype("float32")
+        mean_out = exe.run(program, feed={"X": X}, fetch_list=[output])
+        self.assertTrue(np.allclose(np.mean(X), mean_out[0]))
diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
index 24baf55e90c98f39bab926e8c85a791eee5ed4a4..9eaae1904a01ca7994ad493c3b37352b3d50bab6 100644
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -1,9 +1,22 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
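+#
+# append_regularization_ops rewrites each (param, grad) pair returned by
+# append_backward so that the gradient also carries the weight-decay term.
+# The tests below verify this indirectly: they record len(block.ops) before
+# the call and assert that extra ops were appended afterwards.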
import unittest import paddle.v2.fluid.framework as framework import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.regularizer as regularizer -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestL2DecayRegularizer(unittest.TestCase): @@ -33,7 +46,7 @@ class TestL2DecayRegularizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) params_grads = optimizer.append_regularization_ops(params_grads) @@ -70,7 +83,7 @@ class TestL1DecayRegularizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) params_grads = optimizer.append_regularization_ops(params_grads) diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..0bcdfafcf4496a4f47c69afb7acd24427cbb634c --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -0,0 +1,198 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
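+#
+# reorder_lod_tensor_by_rank permutes the sequences of a LoDTensor into the
+# order given by a rank table, i.e. the sequences of the reference input
+# sorted by decreasing length. The reorder() helper below is the pure-NumPy
+# reference: it builds (index, length) pairs from the reference LoD, sorts
+# them by length, copies each input sequence to its new offset, and rebuilds
+# the output LoD level by level.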
+import unittest +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import numpy + + +class TestReorderLoDTensor(unittest.TestCase): + num_seq = 5 + # [name, shape, lod_level] pair indicating data info of source and target + data_desc = (['input', [9], 0], ['ref', [5], 1]) + + @classmethod + def setUpClass(cls): + cls.set_program() + + @classmethod + def set_program(cls): + dat = fluid.layers.data( + name=cls.data_desc[0][0], shape=cls.data_desc[0][1]) + dat.stop_gradient = False + rank_dat = fluid.layers.data( + name=cls.data_desc[1][0], shape=cls.data_desc[1][1]) + table = fluid.layers.lod_rank_table(rank_dat) + new_dat = fluid.layers.reorder_lod_tensor_by_rank( + x=dat, rank_table=table) + loss = fluid.layers.reduce_sum(new_dat) + fluid.backward.append_backward(loss=loss) + cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD'] + + def run_program(self): + outputs = [] + input_grads = [] + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.CUDAPlace(0)) + for place in places: + self.set_inputs(place) + exe = fluid.Executor(place) + output, input_grad = exe.run(fluid.default_main_program(), + feed=self.inputs, + fetch_list=self.fetch_list, + return_numpy=False) + outputs.append(output) + input_grads.append(input_grad) + self.actual_outputs = outputs + self.actual_grads = input_grads + + def set_data(self): + self.data = {} + for desc in self.data_desc: + data_name = desc[0] + data_shape = desc[1] + data_lod_level = desc[2] + data_lod = [] + for i in range(data_lod_level): + lod_level_i = numpy.random.randint( + low=1, + high=5, + size=self.num_seq if i == 0 else lod_level_i[-1]) + lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + data_lod.append(lod_level_i) + data_value = numpy.random.random( + size=[data_lod[-1][-1] if data_lod else self.num_seq + ] + data_shape).astype('float32') + self.data[data_name] = (data_value, data_lod) + + def set_inputs(self, place): + self.inputs = {} + for desc in self.data_desc: + tensor = fluid.Tensor() + tensor.set(self.data[desc[0]][0], place) + if self.data[desc[0]][1]: + tensor.set_lod(self.data[desc[0]][1]) + self.inputs[desc[0]] = tensor + + def reorder(self): + level = 0 + + # compute the rank_table according to ref_lod + ref_lod = self.data[self.data_desc[1][0]][1][level] + rank_table = [] # list of (index, length) + for i in range(len(ref_lod) - 1): + rank_table.append((i, ref_lod[i + 1] - ref_lod[i])) + rank_table = sorted(rank_table, lambda x, y: y[1] - x[1]) + + # compute the input sequence info according to input_lod + input_value, input_lod = self.data[self.data_desc[0][0]] + + input_table = [] # list of (offset, length, sub_lod) + if input_lod: + for i in range(len(input_lod[level]) - 1): + start_idx = i + end_idx = i + 1 + sub_lod = [] + for lod_level_i in input_lod[level:]: + sub_lod_i = [] + for idx in range(start_idx, end_idx): + sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[ + idx]) + sub_lod.append(sub_lod_i) + start_idx = lod_level_i[start_idx] + end_idx = lod_level_i[end_idx] + input_table.append((start_idx, end_idx - start_idx, sub_lod)) + else: + input_table = [(i, 1, []) for i in range(len(rank_table))] + + # reorder by rank_table + output_value = numpy.zeros_like(input_value) + output_lod = [] + offset = 0 + for index, length in rank_table: + input_seq_start = input_table[index][0] + input_seq_len = input_table[index][1] + input_seq_end = input_seq_start + input_seq_len + output_value[offset:offset + input_seq_len] = input_value[ + input_seq_start:input_seq_end] + 
offset += input_seq_len + + input_seq_sub_lod = input_table[index][2] + if len(output_lod) == 0: + output_lod = [[0] for i in input_seq_sub_lod] + for i, sub_lod_i in enumerate(input_seq_sub_lod): + for idx_sub in sub_lod_i: + output_lod[i].append(output_lod[i][-1] + idx_sub) + return output_value, output_lod + + def test_reorder_lod_tensor(self): + self.data_desc[0][-1] = 2 # input is lod_tensor + self.set_data() + self.run_program() + # check output + expect_output, expect_output_lod = self.reorder() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + self.assertEqual(expect_output_lod, actual_output.lod()) + # check gradient + expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) + expect_grad_lod = self.data[self.data_desc[0][0]][1] + for actual_grad in self.actual_grads: + self.assertTrue( + numpy.allclose( + numpy.array(actual_grad), expect_grad, atol=0.001)) + self.assertEqual(expect_grad_lod, actual_grad.lod()) + + def test_reorder_tensor(self): + self.data_desc[0][-1] = 0 # input is tensor + self.set_data() + self.run_program() + # check output + expect_output, expect_output_lod = self.reorder() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + self.assertEqual(expect_output_lod, actual_output.lod()) + # check gradient + expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) + expect_grad_lod = self.data[self.data_desc[0][0]][1] + for actual_grad in self.actual_grads: + self.assertTrue( + numpy.allclose( + numpy.array(actual_grad), expect_grad, atol=0.001)) + self.assertEqual(expect_grad_lod, actual_grad.lod()) + + # compare outputs between LodTensors with explicit and implicit lod + # use the same data but set the input lod explicitly + input_lod = [[ + i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1) + ]] + self.inputs[self.data_desc[0][0]].set_lod(input_lod) + # preserve the output of LodTensor with implicit lod to compare + expect_output = [ + numpy.array(actual_output) for actual_output in self.actual_outputs + ] + self.run_program() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py index 16bb6bb2af67f7d32a2fafc1cb37412084ec0829..d6e6797043dc07cdc62f41a3d6dac43fc25934d0 100644 --- a/python/paddle/v2/fluid/tests/test_reshape_op.py +++ b/python/paddle/v2/fluid/tests/test_reshape_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
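+#
+# reshape supports a single inferred dimension: TestReshapeOpDimInfer below
+# maps an input of shape (10, 20) to the target shape [4, -1, 5], where the
+# -1 is resolved from the element count as 10 * 20 / (4 * 5) = 10.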
import unittest import numpy as np from op_test import OpTest @@ -17,5 +30,19 @@ class TestReshapeOp(OpTest): self.check_grad(["X"], "Out") +class TestReshapeOpDimInfer(OpTest): + def setUp(self): + self.op_type = "reshape" + self.inputs = {'X': np.random.random((10, 20)).astype("float32")} + self.attrs = {'shape': [4, -1, 5]} + self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_rmsprop_op.py b/python/paddle/v2/fluid/tests/test_rmsprop_op.py index 237bcfccceee89f62fc05e4c6c972a76d1875367..27a1ea213714271fb373e270add039bc4667e6fd 100644 --- a/python/paddle/v2/fluid/tests/test_rmsprop_op.py +++ b/python/paddle/v2/fluid/tests/test_rmsprop_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py index a3cba92504a28590083df57e69f7662a887d94a6..378d7f852304e4cc41ed760663457e470985afbc 100644 --- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py +++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py @@ -1,18 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
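+#
+# Executor.run accepts raw NumPy arrays in the feed map and wraps them in
+# LoDTensors internally, so the tests below feed numpy arrays directly
+# instead of building core.LoDTensor objects by hand.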
 import unittest
 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core
 
 
-def create_tensor(np_data, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_data, place)
-    return tensor
-
-
 class RNNMemoryHelperOpTest(unittest.TestCase):
     def setUp(self):
         self.program = Program()
@@ -30,13 +37,13 @@ class RNNMemoryHelperOpTest(unittest.TestCase):
 
     def test_forward(self):
         x_np = np.random.normal(size=(2, 3)).astype("float32")
-        self.feed_map = {'X': create_tensor(x_np, self.place)}
+        self.feed_map = {'X': x_np}
         self.fetch_list = [self.Out]
         exe = Executor(self.place)
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(np.array(out[0]), x_np, rtol=1e-5)
+        self.assertTrue(np.allclose(out[0], x_np, rtol=1e-5))
 
 
 class RNNMemoryHelperGradOpTest(unittest.TestCase):
@@ -66,8 +73,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase):
 
     def test_backward(self):
         self.feed_map = {
-            name: create_tensor(
-                np.random.normal(size=(2, 3)).astype("float32"), self.place)
+            name: np.random.normal(size=(2, 3)).astype("float32")
             for name in self.input_names
         }
         self.fetch_list = [self.output_vars['X@GRAD']]
@@ -76,7 +82,8 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase):
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(np.array(out[0]), self.feed_map['Out@GRAD'], rtol=1e-5)
+        self.assertTrue(
+            np.allclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5))
 
 
 class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
@@ -110,8 +117,7 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
 
     def test_backward(self):
         self.feed_map = {
-            name: create_tensor(
-                np.random.normal(size=(2, 3)).astype("float32"), self.place)
+            name: np.random.normal(size=(2, 3)).astype("float32")
             for name in ['X', 'Out']
         }
         self.fetch_list = [self.output_vars['X@GRAD']]
@@ -120,10 +126,9 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(
-            np.array(out[0]),
-            np.zeros(shape=(2, 3)).astype("float32"),
-            rtol=1e-5)
+        self.assertTrue(
+            np.allclose(
+                out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d7a698b09e288696caf9ad8460f8df32c11b009
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
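+#
+# ROI pooling scales each ROI box into feature-map coordinates by
+# spatial_scale, splits it into a pooled_height x pooled_width grid of bins,
+# and max-reduces each bin, recording the flattened argmax index
+# (h * width + w) next to every output value; empty bins yield 0. The
+# calc_roi_pool method below is the NumPy reference the operator output and
+# gradient are checked against.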
+import unittest +import numpy as np +import math +import sys +from op_test import OpTest + + +class TestROIPoolOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_pool() + + self.inputs = {'X': self.x, 'ROIs': self.rois} + + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width + } + + self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes} + + def init_test_case(self): + self.batch_size = 5 + self.channels = 3 + self.height = 6 + self.width = 4 + + # n, c, h, w + self.x_dim = (self.batch_size, self.channels, self.height, self.width) + + self.spatial_scale = 1.0 / 4.0 + self.pooled_height = 2 + self.pooled_width = 2 + self.rois_num = 2 + + self.x = np.random.random(self.x_dim).astype('float32') + + def calc_roi_pool(self): + out_data = np.zeros((self.rois_num, self.channels, self.pooled_height, + self.pooled_width)) + argmax_data = np.zeros((self.rois_num, self.channels, + self.pooled_height, self.pooled_width)) + + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = roi[0] + roi_start_w = int(round(roi[1] * self.spatial_scale)) + roi_start_h = int(round(roi[2] * self.spatial_scale)) + roi_end_w = int(round(roi[3] * self.spatial_scale)) + roi_end_h = int(round(roi[4] * self.spatial_scale)) + + roi_height = int(max(roi_end_h - roi_start_h + 1, 1)) + roi_width = int(max(roi_end_w - roi_start_w + 1, 1)) + + x_i = self.x[roi_batch_id] + + bin_size_h = float(roi_height) / float(self.pooled_height) + bin_size_w = float(roi_width) / float(self.pooled_width) + + for c in range(self.channels): + for ph in range(self.pooled_height): + for pw in range(self.pooled_width): + hstart = int(math.floor(ph * bin_size_h)) + wstart = int(math.floor(pw * bin_size_w)) + hend = int(math.ceil((ph + 1) * bin_size_h)) + wend = int(math.ceil((pw + 1) * bin_size_w)) + + hstart = min(max(hstart + roi_start_h, 0), self.height) + hend = min(max(hend + roi_start_h, 0), self.height) + wstart = min(max(wstart + roi_start_w, 0), self.width) + wend = min(max(wend + roi_start_w, 0), self.width) + + is_empty = (hend <= hstart) or (wend <= wstart) + if is_empty: + out_data[i, c, ph, pw] = 0 + else: + out_data[i, c, ph, pw] = -sys.float_info.max + + argmax_data[i, c, ph, pw] = -1 + + for h in range(hstart, hend): + for w in range(wstart, wend): + if x_i[c, h, w] > out_data[i, c, ph, pw]: + out_data[i, c, ph, pw] = x_i[c, h, w] + argmax_data[i, c, ph, pw] = h * \ + self.width + w + + self.outs = out_data.astype('float32') + self.argmaxes = argmax_data.astype('int64') + + def make_rois(self): + rois = [] + batch_ids = np.random.randint(0, self.batch_size, size=self.rois_num) + for i in range(self.rois_num): + x1 = np.random.random_integers( + 0, self.width / self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height / self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width / self.spatial_scale) + y2 = np.random.random_integers(y1 + self.pooled_height, + self.height / self.spatial_scale) + + roi = [batch_ids[i], x1, y1, x2, y2] + rois.append(roi) + self.rois = np.array(rois).astype("int64") + + def setUp(self): + self.op_type = "roi_pool" + self.set_data() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_row_conv_op.py 
b/python/paddle/v2/fluid/tests/test_row_conv_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1234d289cb2f021669b227237be9b0032f4ce935 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py @@ -0,0 +1,108 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +from op_test import OpTest + + +def row_conv_forward(x, lod, wt): + out = np.zeros_like(x) + seq_info = lod[0] + num_sequences = len(seq_info) - 1 + context_length = wt.shape[0] + + for i in range(num_sequences): # loop over number of sequences + start = seq_info[i] + end = seq_info[i + 1] + curinput = x[start:end, :] + curoutput = out[start:end, :] + + cur_timesteps = end - start + for j in range(cur_timesteps): # loop over different timesteps + for k in range(context_length): + + if j + k >= cur_timesteps: + continue + curoutput[j, :] += curinput[j + k, :] * wt[k, :] + + return out + + +class TestRowConvOp1(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 2, 5, 7]] + T = lod[0][-1] + D = 16 + context_length = 2 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter')) + + +class TestRowConvOp2(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 20, 50, 100]] + T = lod[0][-1] + D = 35 + context_length = 35 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + #max_relative_error is increased from 0.05 to 0.06 as for higher + #dimensional input, the dX on CPU for some values has max_rel_error + #slightly more than 0.05 + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_scale_op.py b/python/paddle/v2/fluid/tests/test_scale_op.py index 2ea1e185470280730ae8c8c0ea9568bbeb43eaf5..9847d3d36198c1078958e06ad87590f812e4eaa8 100644 --- 
a/python/paddle/v2/fluid/tests/test_scale_op.py +++ b/python/paddle/v2/fluid/tests/test_scale_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_scatter_op.py b/python/paddle/v2/fluid/tests/test_scatter_op.py index 1032269d5dfb02e3518b9ef2820d5d0dcc8a51a0..b6c4162f6f47c5eb5b8d6a0308fc80baeb37e14c 100644 --- a/python/paddle/v2/fluid/tests/test_scatter_op.py +++ b/python/paddle/v2/fluid/tests/test_scatter_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_scope.py b/python/paddle/v2/fluid/tests/test_scope.py index e4857b590aa6e09f1fa37c4a8a70a3ec9495b085..adaaf1690627e20a9e8d8b21796d061cb6fdc2dc 100644 --- a/python/paddle/v2/fluid/tests/test_scope.py +++ b/python/paddle/v2/fluid/tests/test_scope.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core import unittest diff --git a/python/paddle/v2/fluid/tests/test_selected_rows.py b/python/paddle/v2/fluid/tests/test_selected_rows.py index 93daf37aa2ceb8a599973f7b02874f23fe0763ff..3179a3caaecfc17a71f15f41d69cb82369ed5998 100644 --- a/python/paddle/v2/fluid/tests/test_selected_rows.py +++ b/python/paddle/v2/fluid/tests/test_selected_rows.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/test_seq_concat_op.py b/python/paddle/v2/fluid/tests/test_seq_concat_op.py index dccc6ed8afe2315da74f6886878b15d58b26b3c9..1f026fd76e83a270a7b20815b6ad1f397c062bd7 100644 --- a/python/paddle/v2/fluid/tests/test_seq_concat_op.py +++ b/python/paddle/v2/fluid/tests/test_seq_concat_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import sys diff --git a/python/paddle/v2/fluid/tests/test_seq_conv.py b/python/paddle/v2/fluid/tests/test_seq_conv.py index 14edc5f953022ca05f5620c28bd7276d961dd4d0..c7e508519446fc69758d0dbc91b1ea9bacb9f11b 100644 --- a/python/paddle/v2/fluid/tests/test_seq_conv.py +++ b/python/paddle/v2/fluid/tests/test_seq_conv.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import random diff --git a/python/paddle/v2/fluid/tests/test_seq_pool.py b/python/paddle/v2/fluid/tests/test_seq_pool.py index 512d8b315f29cecf79ae274dca491c240f3447a1..bb15495373fb083b1c7dc031c7286dceef7e4ecf 100644 --- a/python/paddle/v2/fluid/tests/test_seq_pool.py +++ b/python/paddle/v2/fluid/tests/test_seq_pool.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py new file mode 100644 index 0000000000000000000000000000000000000000..650984009a76a56fb65811f7bb805ca656194a35 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +from op_test import OpTest + + +def sequence_erase(in_seq, lod0, tokens): + new_lod0 = [0] + out_seq = [] + for i in range(0, len(lod0) - 1): + num_out = 0 + for dat in in_seq[lod0[i]:lod0[i + 1]]: + if dat not in tokens: + out_seq.append(dat) + num_out += 1 + new_lod0.append(new_lod0[-1] + num_out) + return np.array(out_seq).astype("int32"), new_lod0 + + +class TestSequenceEraseOp(OpTest): + def setUp(self): + self.op_type = "sequence_erase" + in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + lod = [[0, 9, 13, 24, 30]] + tokens = [2, 3, 5] + out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) + self.attrs = {'tokens': tokens} + self.inputs = {'X': (in_seq, lod)} + self.outputs = {'Out': (out_seq, [new_lod0])} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_seq_expand.py b/python/paddle/v2/fluid/tests/test_sequence_expand.py similarity index 69% rename from python/paddle/v2/fluid/tests/test_seq_expand.py rename to python/paddle/v2/fluid/tests/test_sequence_expand.py index ff17edd04bfd34ab8449a0ae05aacf66632dabc8..aacdabf295dc1c26c62db5ebfa6961df9fed5816 100644 --- a/python/paddle/v2/fluid/tests/test_seq_expand.py +++ b/python/paddle/v2/fluid/tests/test_sequence_expand.py @@ -1,9 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
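A hand-checked illustration of sequence_erase above (the values are made up): erased tokens are dropped and the LoD offsets shrink to match.

import numpy as np

in_seq = np.array([[1], [2], [3], [2], [4]]).astype("int32")
lod0 = [0, 3, 5]  # two sequences: [1, 2, 3] and [2, 4]
out_seq, new_lod0 = sequence_erase(in_seq, lod0, [2])
assert out_seq.flatten().tolist() == [1, 3, 4]  # every 2 removed
assert new_lod0 == [0, 2, 3]                    # lengths 2 and 1 remain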
import unittest import numpy as np from op_test import OpTest -class TestSeqExpand(OpTest): +class TestSequenceExpand(OpTest): def set_data(self): x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') @@ -21,7 +34,7 @@ class TestSeqExpand(OpTest): self.outputs = {'Out': out} def setUp(self): - self.op_type = 'seq_expand' + self.op_type = 'sequence_expand' self.set_data() self.compute() @@ -32,7 +45,7 @@ class TestSeqExpand(OpTest): self.check_grad(["X"], "Out") -class TestSeqExpandCase1(TestSeqExpand): +class TestSequenceExpandCase1(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') x_lod = [[0, 2, 5]] @@ -41,7 +54,7 @@ class TestSeqExpandCase1(TestSeqExpand): self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} -class TestSeqExpandCase2(TestSeqExpand): +class TestSequenceExpandCase2(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') x_lod = [[0, 1]] @@ -50,7 +63,7 @@ class TestSeqExpandCase2(TestSeqExpand): self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} -class TestSeqExpandCase3(TestSeqExpand): +class TestSequenceExpandCase3(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') x_lod = [[0, 1, 2, 3, 4]] diff --git a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py index ccd9a05343b0c4aa05b258959665c0662f271512..94062431f0b4297535c20c4b86a11a8058c8876e 100644 --- a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py +++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import sys diff --git a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py index b54a56aa6d3f76baa4d1fc6ba8f963332deba002..8170e4d7f18d0fe28d8f927f122139369ee672fe 100644 --- a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py +++ b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py @@ -1,13 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest - - -def stable_softmax(x): - """Compute the softmax of vector x in a numerically stable way.""" - shiftx = x - np.max(x).clip(-64.) 
- exps = np.exp(shiftx) - return exps / np.sum(exps) +from test_softmax_op import stable_softmax class TestSequenceSoftmaxOp(OpTest): diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py index ca05a381f06cfd40b7939dbda8d4f1f4aacd0271..4a71fb30a9c7a185be8150a476a6bb317d19e3eb 100644 --- a/python/paddle/v2/fluid/tests/test_sgd_op.py +++ b/python/paddle/v2/fluid/tests/test_sgd_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np import paddle.v2.fluid.core as core @@ -55,8 +68,7 @@ class TestSparseSGDOp(unittest.TestCase): Grad='Grad', ParamOut='Param', LearningRate='LearningRate') - ctx = core.DeviceContext.create(place) - sgd_op.run(scope, ctx) + sgd_op.run(scope, place) # get and compare result result_array = np.array(param) @@ -79,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase): def test_sparse_sgd(self): places = [core.CPUPlace()] if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py index 1a3b88e18e38b88d75ad17a0bb6a2965d1e60406..1825a5258fa1ab08c405d20e3e77b4d92a200e7f 100644 --- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py +++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py @@ -1,46 +1,101 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
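The refactor above drops the local stable_softmax copy in favor of the shared helper in test_softmax_op; both forms rely on softmax being invariant to subtracting a constant from its input. A minimal numpy-only sketch of that identity:

import numpy as np

def naive_softmax(x):
    exps = np.exp(x)
    return exps / np.sum(exps)

x = np.array([1.0, 2.0, 3.0])
shifted = np.exp(x - np.max(x))  # the numerically stable form
assert np.allclose(naive_softmax(x), shifted / np.sum(shifted))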
import unittest import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.framework import g_main_program -import numpy +from paddle.v2.fluid.backward import append_backward +from paddle.v2.fluid.framework import default_main_program, switch_main_program +from paddle.v2.fluid.framework import Program +import numpy as np -class TestShrinkRNNMemory(unittest.TestCase): - def test_shrink_rnn_memory(self): - x = layers.data('x', shape=[100], data_type='float32') +class TestShrinkRNNMemoryBase(unittest.TestCase): + def setUp(self): + self.main_program = Program() + switch_main_program(self.main_program) + x = layers.data('x', shape=[100], dtype='float32') x.stop_gradient = False - table = layers.lod_rank_table(x=x) + rank_table_tensor = layers.data( + 'rank_table_tensor', shape=[1], dtype='float32', lod_level=1) + table = layers.lod_rank_table(x=rank_table_tensor) i = layers.zeros(dtype='int64', shape=[1]) - mem1 = layers.shrink_memory(x=x, i=i, table=table) + self.mem1 = layers.shrink_memory(x=x, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - mem2 = layers.shrink_memory(x=mem1, i=i, table=table) + self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - mem3 = layers.shrink_memory(x=mem2, i=i, table=table) + self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) + mem3_mean = layers.mean(x=self.mem3) + append_backward(loss=mem3_mean) + self.x_grad = self.main_program.global_block().var('x@GRAD') + def sum_lodtensor(self, tensor): + sum_res = 0.0 + for i in xrange(np.product(tensor.get_dims())): + sum_res += tensor.get_float_element(i) + return sum_res + + +class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase): + def test_refer_lod(self): + cpu = core.CPUPlace() + x_tensor = core.LoDTensor() + x_tensor.set_lod([[0, 2, 5, 6]]) + tensor_np = np.random.random(size=(6, 100)).astype('float32') + x_tensor.set(tensor_np, cpu) + + rank_table_tensor = core.LoDTensor() + rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), + cpu) + + exe = Executor(cpu) + outs = exe.run( + feed={'x': x_tensor, + 'rank_table_tensor': rank_table_tensor}, + fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad], + return_numpy=False) + self.assertTrue(np.allclose(tensor_np[0:6], outs[0])) + self.assertTrue(np.allclose(tensor_np[0:5], outs[1])) + self.assertTrue(np.allclose(tensor_np[0:2], outs[2])) + self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01) + + +class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase): + def test_no_lod(self): cpu = core.CPUPlace() - tensor = core.LoDTensor() - tensor.set_lod([[0, 2, 5, 6]]) - tensor_np = numpy.random.random(size=(3, 100)).astype('float32') - tensor.set(tensor_np, cpu) + x_tensor = core.LoDTensor() + tensor_np = np.random.random(size=(3, 100)).astype('float32') + x_tensor.set(tensor_np, cpu) + + rank_table_tensor = core.LoDTensor() + rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), + cpu) + exe = Executor(cpu) - outs = map(numpy.array, - exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3])) - self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0])) - self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1])) - self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) - - mem3_mean = 
layers.mean(x=mem3) - append_backward_ops(loss=mem3_mean) - x_grad = map(numpy.array, - exe.run(feed={'x': tensor}, - fetch_list=[ - g_main_program.global_block().var('x@GRAD') - ]))[0] - self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1) + outs = exe.run( + feed={'x': x_tensor, + 'rank_table_tensor': rank_table_tensor}, + fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad], + return_numpy=False) + self.assertTrue(np.allclose(tensor_np[0:3], outs[0])) + self.assertTrue(np.allclose(tensor_np[0:2], outs[1])) + self.assertTrue(np.allclose(tensor_np[0:1], outs[2])) + self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py index e53856b38aa5ddd6061b350a66e9fe86bc23923c..132502c9cba5af71b2477cded1a9bc63ed842a56 100644 --- a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py @@ -1,12 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np from op_test import OpTest from scipy.special import logit from scipy.special import expit +import unittest class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): - '''Test sigmoid_cross_entropy_with_logit_op with binary labels - ''' + """Test sigmoid_cross_entropy_with_logit_op with binary label + """ def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" @@ -16,16 +30,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): 'X': logit( np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32")), - 'Labels': np.random.randint(0, 2, (batch_size, num_classes)) + 'Label': np.random.randint(0, 2, (batch_size, num_classes)) .astype("float32") } # Fw Pass is implemented as elementwise sigmoid followed by # elementwise logistic loss - # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) sigmoid_X = expit(self.inputs['X']) - term1 = self.inputs['Labels'] * np.log(sigmoid_X) - term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) self.outputs = {'Out': -term1 - term2} def test_check_output(self): @@ -36,8 +50,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): - '''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels - ''' + """Test sigmoid_cross_entropy_with_logit_op with probabilistic label + """ def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" @@ -47,16 +61,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): 'X': logit( np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32")), - 'Labels': np.random.uniform(0, 1, (batch_size, num_classes)) + 'Label':
np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32") } # Fw Pass is implemented as elementwise sigmoid followed by # elementwise logistic loss - # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) sigmoid_X = expit(self.inputs['X']) - term1 = self.inputs['Labels'] * np.log(sigmoid_X) - term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) self.outputs = {'Out': -term1 - term2} def test_check_output(self): @@ -64,3 +78,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): def test_check_grad(self): self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_sign_op.py b/python/paddle/v2/fluid/tests/test_sign_op.py index c6b59bcfd8ba71e54d4c3a2b7a3dac1f2a346265..f649cb9e7cd9938c697a8c36a88d5b39507e4269 100644 --- a/python/paddle/v2/fluid/tests/test_sign_op.py +++ b/python/paddle/v2/fluid/tests/test_sign_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py index b7f13c5699918d4969300499bd03e1668b2a4bca..1052eaa8b0ec281208344175a9978e25528f4e5d 100644 --- a/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py +++ b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_softmax_op.py b/python/paddle/v2/fluid/tests/test_softmax_op.py index b41c810d9a6269c934a434b085748a86deccb475..d03e50b2f1edb43ada22b0357fc136c5edccd6fe 100644 --- a/python/paddle/v2/fluid/tests/test_softmax_op.py +++ b/python/paddle/v2/fluid/tests/test_softmax_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest @@ -17,14 +30,14 @@ class TestSoftmaxOp(OpTest): 'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32") } self.outputs = { - 'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) + 'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) } def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(['X'], 'Out') if __name__ == "__main__": diff --git a/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py index c2f07f9096c69f3d4977f9444bdd5dcda8028973..330467081b41d980599dff7804e84a8bc000912a 100644 --- a/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py index 3aed83b2ea3418c54f9540279ae6e2e0045421fa..4e90404eca49213c1fe2a5a35a912cd048ce7af8 100644 --- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py @@ -1,10 +1,23 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
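Returning to the sigmoid_cross_entropy_with_logits tests above: their expected output comes from the commented elementwise formula. A standalone numpy/scipy sketch of that loss follows; the entropy identity at the end is a property of the formula, not something the tests themselves assert.

import numpy as np
from scipy.special import expit, logit

def reference_loss(x, label):
    # label * -log(sigmoid(x)) + (1 - label) * -log(1 - sigmoid(x))
    s = expit(x)  # elementwise sigmoid
    return -(label * np.log(s) + (1 - label) * np.log(1 - s))

# Since logit is the inverse of expit, the loss at x = logit(p) with
# label p equals the binary entropy of p.
p = np.array([0.2, 0.5, 0.9])
assert np.allclose(reference_loss(logit(p), p),
                   -(p * np.log(p) + (1 - p) * np.log(1 - p)))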
import unittest import paddle.v2.fluid.core as core import numpy as np import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -75,30 +88,30 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): level=0): place = self.place() program = Program() - x = layers.data(name='x', shape=[1], main_program=program) - x.persistable = True + with program_guard(program): + x = layers.data(name='x', shape=[1]) + x.persistable = True - y = layers.data(name='y', shape=[1], main_program=program) - y.persistable = True + y = layers.data(name='y', shape=[1]) + y.persistable = True - out_true, out_false = layers.split_lod_tensor( - input=x, mask=y, level=level, main_program=program) - out_true.persistable = True - out_false.persistable = True + out_true, out_false = layers.split_lod_tensor( + input=x, mask=y, level=level) + out_true.persistable = True + out_false.persistable = True - out = layers.merge_lod_tensor( - in_true=out_true, - in_false=out_false, - mask=y, - x=x, - level=level, - main_program=program) + out = layers.merge_lod_tensor( + in_true=out_true, in_false=out_false, mask=y, x=x, level=level) - out.persistable = True + out.persistable = True exe = Executor(place) scope = core.Scope() - exe.run(program, feed={'x': tensor, 'y': mask}, scope=scope) + exe.run(program, + feed={'x': tensor, + 'y': mask}, + scope=scope, + return_numpy=False) var_true = scope.find_var(out_true.name).get_tensor() @@ -119,34 +132,21 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): def test_grad(self): place = core.CPUPlace() program = Program() + with program_guard(program): + x = layers.data( + name='x', shape=[1], dtype='float32', stop_gradient=False) + y = layers.data( + name='y', shape=[1], dtype='bool', stop_gradient=False) - x = layers.data( - name='x', - shape=[1], - data_type='float32', - main_program=program, - stop_gradient=False) - y = layers.data( - name='y', - shape=[1], - data_type='bool', - main_program=program, - stop_gradient=False) - - level = 0 - - out_true, out_false = layers.split_lod_tensor( - input=x, mask=y, level=level, main_program=program) - out = layers.merge_lod_tensor( - in_true=out_true, - in_false=out_false, - mask=y, - x=x, - level=level, - main_program=program) - mean = layers.mean(x=out, main_program=program) - - append_backward_ops(mean) + level = 0 + + out_true, out_false = layers.split_lod_tensor( + input=x, mask=y, level=level) + out = layers.merge_lod_tensor( + in_true=out_true, in_false=out_false, mask=y, x=x, level=level) + mean = layers.mean(x=out) + + append_backward(mean) tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) @@ -169,7 +169,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): feed={'x': tensor, 'y': mask}, fetch_list=[g_vars], - scope=scope)) + scope=scope, + return_numpy=False)) ] g_out_sum = np.array(g_out).sum() diff --git a/python/paddle/v2/fluid/tests/test_split_op.py b/python/paddle/v2/fluid/tests/test_split_op.py index 37c6ebb89d1c3bcfc3c80a54a1e92c0326e046e3..000c300446f7a026477eba2e854b94d20969b5a3 100644 --- a/python/paddle/v2/fluid/tests/test_split_op.py +++ b/python/paddle/v2/fluid/tests/test_split_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_spp_op.py b/python/paddle/v2/fluid/tests/test_spp_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f09bb94b449f2868575d081490027efc472b5b95 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_spp_op.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +from op_test import OpTest +from test_pool2d_op import max_pool2D_forward_naive +from test_pool2d_op import avg_pool2D_forward_naive + + +class TestSppOp(OpTest): + def setUp(self): + self.op_type = "spp" + self.init_test_case() + input = np.random.random(self.shape).astype("float32") + nsize, csize, hsize, wsize = input.shape + out_level_flatten = [] + for i in xrange(self.pyramid_height): + bins = np.power(2, i) + kernel_size = [0, 0] + padding = [0, 0] + kernel_size[0] = np.ceil(hsize / + bins.astype("double")).astype("int32") + padding[0] = ( + (kernel_size[0] * bins - hsize + 1) / 2).astype("int32") + + kernel_size[1] = np.ceil(wsize / + bins.astype("double")).astype("int32") + padding[1] = ( + (kernel_size[1] * bins - wsize + 1) / 2).astype("int32") + out_level = self.pool2D_forward_naive(input, kernel_size, + kernel_size, padding) + out_level_flatten.append( + out_level.reshape(nsize, bins * bins * csize)) + if i == 0: + output = out_level_flatten[i] + else: + output = np.concatenate((output, out_level_flatten[i]), 1) + # output = np.concatenate(out_level_flatten.tolist(), 0); + self.inputs = {'X': input.astype('float32'), } + self.attrs = { + 'pyramid_height': self.pyramid_height, + 'pooling_type': self.pool_type + } + + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + if self.pool_type != "avg": + self.check_grad(['X'], 'Out', max_relative_error=0.05) + + def init_test_case(self): + self.shape = [3, 2, 4, 4] + self.pyramid_height = 3 + self.pool2D_forward_naive = max_pool2D_forward_naive + self.pool_type = "max" + + +class TestCase2(TestSppOp): + def init_test_case(self): + self.shape = [3, 2, 4, 4] + self.pyramid_height = 3 + self.pool2D_forward_naive = avg_pool2D_forward_naive + self.pool_type = "avg" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py 
b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py index dc6ebf5d30369231b4918a168bbdf25c7096c808..7b80d81d728197df7a089e35356e847234a8d3c9 100644 --- a/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py +++ b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py index 5a52c6a66c781672a483324083b97a3c5894f508..80994f5937ec09d754f8782128bb36214321a3c7 100644 --- a/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy as np import unittest from numpy import linalg as LA diff --git a/python/paddle/v2/fluid/tests/test_sum_op.py b/python/paddle/v2/fluid/tests/test_sum_op.py index 60254291e2ab9215e2bc37c12d5e2e1ca6d33d5d..366708ac839643949e36cebe29392f6d4b8d5e6a 100644 --- a/python/paddle/v2/fluid/tests/test_sum_op.py +++ b/python/paddle/v2/fluid/tests/test_sum_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py index 9f870d9eb3485aa0b54eb781b906f4232d12c49e..62a48b206cd3fb1adefe8a015cf3e2df17a779aa 100644 --- a/python/paddle/v2/fluid/tests/test_tensor.py +++ b/python/paddle/v2/fluid/tests/test_tensor.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import paddle.v2.fluid.core as core import unittest import numpy diff --git a/python/paddle/v2/fluid/tests/test_tensor_array.py b/python/paddle/v2/fluid/tests/test_tensor_array.py deleted file mode 100644 index d6929ba16e4dae0c57adcceb4f0e78c094eee55c..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_tensor_array.py +++ /dev/null @@ -1,106 +0,0 @@ -import logging -import paddle.v2.fluid.core as core -import unittest -import numpy as np - - -class TestTensorArray(unittest.TestCase): - def setUp(self): - self.ta = core.TensorArray() - - self.batch_size = 10 - self.dim = 2 - - # create a LoDTensor - self.scope = core.Scope() - var = self.scope.var("test_tensor") - self.place = core.CPUPlace() - tensor = var.get_tensor() - tensor.set_dims([self.batch_size, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - tensor_array[0, 0] = 0 - tensor_array[1, 0] = 1 - tensor_array[2, 0] = 2 - tensor_array[3, 0] = 3 - tensor_array[4, 0] = 4 - tensor_array[5, 0] = 5 - tensor_array[6, 0] = 6 - tensor_array[7, 0] = 7 - tensor_array[8, 0] = 8 - tensor_array[9, 0] = 9 - - lod_py = [[0, 2, 5, 10]] - lod_tensor = core.LoDTensor(lod_py) - lod_tensor.set(tensor_array, self.place) - - self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]] - - self.tensor = lod_tensor - - def test_unstack(self): - self.ta.unstack(self.tensor) - self.assertEqual(self.tensor.get_dims()[0], self.ta.size()) - - def test_read(self): - self.ta.unstack(self.tensor) - for i in range(self.batch_size): - tensor = self.ta.read(i) - - def test_write(self): - self.ta.unstack(self.tensor) - - # create a tensor with shape of [1, self.dim] - var = self.scope.var("hell") - tensor = var.get_tensor() - tensor.set_dims([1, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - for i in range(self.dim): - tensor_array[0, i] = i - tensor.set(tensor_array, self.place) - - self.ta.write(2, tensor) - - ta_tensor = self.ta.read(2) - ta_tensor_array = np.array(ta_tensor) - self.assertEqual(ta_tensor.get_dims(), [1, self.dim]) - self.assertTrue((tensor_array == ta_tensor_array).all()) - - def test_write_shared(self): - self.ta.unstack(self.tensor) - - # create a tensor with shape of [1, self.dim] - var = self.scope.var("hell") - tensor = var.get_tensor() - tensor.set_dims([1, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - for i in range(self.dim): - tensor_array[0, i] = i - tensor.set(tensor_array, self.place) - - self.ta.write_shared(2, tensor) - - ta_tensor = self.ta.read(2) - ta_tensor_array = np.array(ta_tensor) - self.assertEqual(ta_tensor.get_dims(), [1, self.dim]) - self.assertTrue((tensor_array == ta_tensor_array).all()) - - def test_unpack(self): - meta = self.ta.unpack(self.tensor, 0, True) - self.assertEqual(self.ta.size(), 5) - self.assertEqual(meta, self.py_seq_meta) - - def test_pack(self): - meta = self.ta.unpack(self.tensor, 0, True) - print "meta", meta - tensor = self.ta.pack(0, meta, self.tensor.lod()) - print np.array(self.tensor) - print np.array(tensor) - self.assertTrue((np.array(self.tensor) == np.array(tensor)).all()) - 
self.assertTrue(tensor.lod(), self.tensor.lod()) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_top_k_op.py b/python/paddle/v2/fluid/tests/test_top_k_op.py index 6e8fbefa6eafa391cdb5e17c882ee74b5bdc6507..86968dba140c4e0b6e94b9bf78255bcbb753434d 100644 --- a/python/paddle/v2/fluid/tests/test_top_k_op.py +++ b/python/paddle/v2/fluid/tests/test_top_k_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_transpose_op.py b/python/paddle/v2/fluid/tests/test_transpose_op.py index 9409cbaa00f792b60d5950556b869108aa732478..ff2541f450ca8ce374404aef65f1caa2d5f4f00b 100644 --- a/python/paddle/v2/fluid/tests/test_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_transpose_op.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index f736dfb2e85552b321403c961da517f3b3efb100..332ac4f07ff936569c43aaab4a698f2b8e3fb985 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -1,32 +1,62 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
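The TensorArray test deleted above built its input as a LoDTensor, and the surviving tests keep using the same pattern. A minimal sketch of that construction, assuming paddle.v2.fluid.core builds locally:

import numpy as np
import paddle.v2.fluid.core as core

place = core.CPUPlace()
tensor = core.LoDTensor()
tensor.set_lod([[0, 2, 5, 10]])  # three sequences: rows 0-1, 2-4, 5-9
tensor.set(np.random.random((10, 2)).astype('float32'), place)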
import unittest +import numpy + from paddle.v2.fluid.op import Operator import paddle.v2.fluid.core as core -import numpy +import paddle.v2.fluid as fluid class TestUniformRandomOp(unittest.TestCase): - def test_uniform_random_cpu(self): + def setUp(self): + self.op_type = "uniform_random" + self.inputs = {} + self.attrs = { + "shape": [1000, 784], + "min": -5.0, + "max": 10.0, + "seed": 10 + } + self.outputs = ["Out"] + + def test_cpu(self): self.uniform_random_test(place=core.CPUPlace()) - def test_uniform_random_gpu(self): + def test_gpu(self): if core.is_compile_gpu(): - self.uniform_random_test(place=core.GPUPlace(0)) + self.uniform_random_test(place=core.CUDAPlace(0)) def uniform_random_test(self, place): - scope = core.Scope() - scope.var('X').get_tensor() - - op = Operator( - "uniform_random", - Out='X', - shape=[1000, 784], - min=-5.0, - max=10.0, - seed=10) - - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) - tensor = numpy.array(scope.find_var('X').get_tensor()) + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = fluid.Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + + tensor = outs[0] + self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1) diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py new file mode 100644 index 0000000000000000000000000000000000000000..988c0c75063f42f8865388979e14a00db3733400 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -0,0 +1,96 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
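The closing assertion in the rewritten uniform_random test follows from the mean of a uniform distribution: E[U(min, max)] = (min + max) / 2 = (-5.0 + 10.0) / 2 = 2.5. A numpy-only check of the same tolerance:

import numpy as np

samples = np.random.uniform(-5.0, 10.0, size=(1000, 784))
assert abs(samples.mean() - 2.5) < 0.1  # matches assertAlmostEqual(..., delta=0.1)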
+import unittest +import numpy as np +from op_test import OpTest + + +def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): + s0, s1, s2, s3 = input.shape + out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] + out_wsize = (s3 - 1) * strides[1] - 2 * paddings[1] + ksize[1] + out = np.zeros((s0, s1, out_hsize, out_wsize)) + for nidx in xrange(s0): + for cidx in xrange(s1): + for h in xrange(s2): + for w in xrange(s3): + index = indices[nidx, cidx, h, w] + hidx = (index - index % out_wsize) / out_wsize + widx = index % out_wsize + out[nidx, cidx, int(hidx), int(widx)] = \ + input[nidx, cidx, h, w] + + return out + + +class TestUnpoolOp(OpTest): + def setUp(self): + self.op_type = "unpool" + self.init_test_case() + pre_input = np.random.random(self.shape).astype("float32") + nsize, csize, hsize, wsize = pre_input.shape + hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \ + self.strides[0] + 1 + wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \ + self.strides[1] + 1 + input = np.zeros((nsize, csize, hsize_out, wsize_out)) + indices = np.zeros((nsize, csize, hsize_out, wsize_out)) + for i in xrange(hsize_out): + for j in xrange(wsize_out): + r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) + r_end = np.min((i * self.strides[0] + self.ksize[0] - \ + self.paddings[0], hsize)) + c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) + c_end = np.min((j * self.strides[1] + self.ksize[1] - \ + self.paddings[1], wsize)) + for nidx in xrange(nsize): + for cidx in xrange(csize): + x_masked = pre_input[nidx, cidx, r_start:r_end, \ + c_start:c_end] + input[nidx, cidx, i, j] = x_masked.max() + arg = x_masked.argmax() + indices[nidx, cidx, i, j] = \ + (r_start + arg / self.ksize[1]) * wsize + \ + c_start + arg % self.ksize[1] + output = self.unpool2d_forward_naive(input, indices, self.ksize, \ + self.strides, self.paddings).astype("float32") + self.inputs = { + 'X': input.astype('float32'), + 'Indices': indices.astype('int32') + } + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.unpooling_type, + } + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.unpool2d_forward_naive = unpool2dmax_forward_naive + self.unpooling_type = "max" + self.shape = [6, 4, 5, 5] + self.ksize = [3, 3] + self.strides = [2, 2] + self.paddings = [0, 0] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py index c3e1f9ac0a70e7448fd8d1983b1c04d27af9771c..199fd4a8c26fc2ebcd50655fd2ad131b08383071 100644 --- a/python/paddle/v2/fluid/tests/test_variable.py +++ b/python/paddle/v2/fluid/tests/test_variable.py @@ -1,5 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License.
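The setUp in the unpool test above stores each max location as a flattened index, row * width + col, and unpool2dmax_forward_naive decodes it back with a modulo and a division. A made-up round trip over a single index:

width = 6                  # illustrative output width
row, col = 2, 3
index = row * width + col  # 15, the form stored in `indices`
assert index % width == col
assert (index - index % width) / width == row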
import unittest -from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_ +from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_ import paddle.v2.fluid.core as core import numpy as np @@ -18,17 +31,17 @@ class TestVariable(unittest.TestCase): self.assertRaises(ValueError, lambda: convert("int8")) def test_var(self): - b = g_main_program.current_block() + b = default_main_program().current_block() w = b.create_var( dtype="float64", shape=[784, 100], lod_level=0, name="fc.w") self.assertNotEqual(str(w), "") - self.assertEqual(core.DataType.FP64, w.data_type) + self.assertEqual(core.DataType.FP64, w.dtype) self.assertEqual((784, 100), w.shape) self.assertEqual("fc.w", w.name) self.assertEqual(0, w.lod_level) w = b.create_var(name='fc.w') - self.assertEqual(core.DataType.FP64, w.data_type) + self.assertEqual(core.DataType.FP64, w.dtype) self.assertEqual((784, 100), w.shape) self.assertEqual("fc.w", w.name) self.assertEqual(0, w.lod_level) diff --git a/python/paddle/v2/fluid/tests/test_warpctc_op.py b/python/paddle/v2/fluid/tests/test_warpctc_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9f565676c5af1685704681758a8590ecb6f59026 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_warpctc_op.py @@ -0,0 +1,231 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+import sys +import unittest +import numpy as np +from op_test import OpTest +from test_softmax_op import stable_softmax + +CUDA_BLOCK_SIZE = 512 + + +class CTCForward(object): + def __init__(self, softmax, softmax_lod, labels, labels_lod, blank, + norm_by_times): + self.softmax = softmax + self.softmax_lod = softmax_lod + assert labels.shape[1] == 1 + self.labels = labels + self.labels_lod = labels_lod + self.blank = blank + self.norm_by_times = norm_by_times + + self.level = 0 + self.num_classes = softmax.shape[1] + self.batch_size = len(softmax_lod[self.level]) - 1 + assert self.batch_size == len(labels_lod[self.level]) - 1 + + self.loss = np.zeros([self.batch_size, 1], dtype="float32") + self.gradient = np.zeros(self.softmax.shape, dtype="float32") + + # float64 + self.EXP_MAX = sys.float_info.max + self.EXP_MIN = sys.float_info.min + self.LOG_ZERO = np.log(self.EXP_MIN) + self.LOG_INFINITY = np.log(self.EXP_MAX) + + def safe_exp(self, x): + if x <= self.LOG_ZERO: + return 0.0 + if x >= self.LOG_INFINITY: + return self.EXP_MAX + return np.exp(x) + + def safe_log(self, x): + if x <= self.EXP_MIN: + return self.LOG_ZERO + return np.log(x) + + # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb + def log_div(self, x, y): + res = x - y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb + def log_mul(self, x, y): + res = x + y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, + # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a + def log_add(self, x, y): + if x < y: + t = y + y = x + x = t + return x + self.safe_log(1 + self.safe_exp(y - x)) + + def segment_range(self, time, total_times, total_segments): + start = max(0, total_segments - (2 * (total_times - time))) + end = min(total_segments, 2 * (time + 1)) + return start, end + + def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence): + total_times = softmax_a_sequence.shape[0] + total_segments = labels_a_sequence.shape[0] * 2 + 1 + + required_times = labels_a_sequence.shape[0] + old_label = -1 + for i in range(labels_a_sequence.shape[0]): + # two contiguous labels with the same value + if labels_a_sequence[i, 0] == old_label: + required_times = required_times + 1 + old_label = labels_a_sequence[i, 0] + + if total_times < required_times: + return 0 + + # calculate the forward and backward variables, + # reference Chapter 7.3 of "Alex Graves, Supervised Sequence + # Labelling with Recurrent Neural Networks" + log_acts = np.zeros([total_times, self.num_classes], dtype="float32") + for i in range(total_times): + for j in range(self.num_classes): + log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j]) + + # calculate the forward variables + forward_vars = np.zeros([total_times, total_segments], dtype="float32") + for i in range(total_times): + for j in range(total_segments): + forward_vars[i, j] = self.LOG_ZERO + + for i in range(total_times): + # dp initialization at t0 + if i == 0: + forward_vars[i, 0] = log_acts[0, self.blank] + if total_segments > 1: + forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]] + continue + + # dp from t1 + start, end = self.segment_range(i, total_times, total_segments) + for k in range(end - start): + j = k + start + if j & 1 == 1: + label_idx = j / 2 + label_val = labels_a_sequence[label_idx, 0] + fv = self.log_add(forward_vars[i
- 1, j], + forward_vars[i - 1, j - 1]) + if j > 1 and label_val != labels_a_sequence[label_idx - 1, + 0]: + fv = self.log_add(fv, forward_vars[i - 1, j - 2]) + fv = self.log_mul(fv, log_acts[i, label_val]) + else: + fv = forward_vars[i - 1, j] + if j > 0: + fv = self.log_add(fv, forward_vars[i - 1, j - 1]) + fv = self.log_mul(fv, log_acts[i, self.blank]) + forward_vars[i, j] = fv + + # sum the last two value as log_prob + log_prob = forward_vars[total_times - 1, total_segments - 1] + if total_segments > 1: + log_prob = self.log_add( + log_prob, forward_vars[total_times - 1, total_segments - 2]) + + return -log_prob + + def forward(self): + for i in range(self.batch_size): + softmax_start_i = self.softmax_lod[self.level][i] + softmax_end_i = self.softmax_lod[self.level][i + 1] + labels_start_i = self.labels_lod[self.level][i] + labels_end_i = self.labels_lod[self.level][i + 1] + + softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] + labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + return self.loss + + +class TestWarpCTCOp(OpTest): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[0, 4, 5, 8, 11]] + self.labels_lod = [[0, 3, 4, 8, 12]] + self.blank = self.num_classes - 1 + self.norm_by_times = False + + def setUp(self): + self.op_type = "warpctc" + self.config() + + logits = np.random.uniform( + 0.1, 1.0, + [self.logits_lod[0][-1], self.num_classes]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + # labels should not be blank + labels = np.random.randint( + 0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32") + + ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, + self.blank, self.norm_by_times) + loss = ctc.forward() + + max_sequence_length = 0 + for i in range(self.batch_size): + max_sequence_length = max( + max_sequence_length, + self.logits_lod[0][i + 1] - self.logits_lod[0][i]) + self.gradient = np.zeros( + [max_sequence_length, self.batch_size, self.num_classes], + dtype="float32") + + self.inputs = { + "Logits": (logits, self.logits_lod), + "Label": (labels, self.labels_lod) + } + self.outputs = {"Loss": loss} + self.attrs = {"blank": self.blank, "norm_by_times": self.norm_by_times} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.outputs['WarpCTCGrad'] = self.gradient + self.check_grad(["Logits"], "Loss", max_relative_error=0.007) + + +class TestWarpCTCOpCase1(TestWarpCTCOp): + def config(self): + self.batch_size = 4 + self.num_classes = CUDA_BLOCK_SIZE + 2 + self.logits_lod = [[0, 4, 5, 8, 11]] + self.labels_lod = [[0, 3, 4, 8, 12]] + self.blank = 0 + self.norm_by_times = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py index 84b432333f950f754a97bc1a051b59c16fb22aed..72de0a03612053ba3a07526726a65606275dc61c 100644 --- a/python/paddle/v2/fluid/tests/test_while_op.py +++ b/python/paddle/v2/fluid/tests/test_while_op.py @@ -1,19 +1,32 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.core as core -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy class TestWhileOp(unittest.TestCase): def test_simple_forward(self): d0 = layers.data( - "d0", shape=[10], append_batch_size=False, data_type='float32') + "d0", shape=[10], append_batch_size=False, dtype='float32') d1 = layers.data( - "d1", shape=[10], append_batch_size=False, data_type='float32') + "d1", shape=[10], append_batch_size=False, dtype='float32') d2 = layers.data( - "d2", shape=[10], append_batch_size=False, data_type='float32') + "d2", shape=[10], append_batch_size=False, dtype='float32') i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True init = layers.zeros(shape=[10], dtype='float32') @@ -46,7 +59,7 @@ class TestWhileOp(unittest.TestCase): sum_result = layers.array_read(array=mem_array, i=i) loss = layers.mean(x=sum_result) - append_backward_ops(loss) + append_backward(loss) cpu = core.CPUPlace() exe = Executor(cpu) @@ -55,19 +68,10 @@ class TestWhileOp(unittest.TestCase): for i in xrange(3): d.append(numpy.random.random(size=[10]).astype('float32')) - d_tensor = [] - for item in d: - t = core.LoDTensor() - t.set(item, cpu) - d_tensor.append(t) - - outs = map(numpy.array, - exe.run(feed={ - 'd0': d_tensor[0], - 'd1': d_tensor[1], - 'd2': d_tensor[2] - }, - fetch_list=[sum_result])) + outs = exe.run(feed={'d0': d[0], + 'd1': d[1], + 'd2': d[2]}, + fetch_list=[sum_result]) self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01) diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 7408ea8ef611ddfa74dc5bb6ef45d4e0ccb9d141..a6fa0cecb87e86e804815012885678a9fc557d95 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. """ This file contains some common interfaces for image preprocess. Many users are confused about the image layout. We introduce diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 9148cb56cf78e1ebb994f4a4a34d4a1b6e2e6ef4..39d1bfff0c8659bb87b9b97334f377639cea9c59 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import numpy import collections import topology diff --git a/python/paddle/v2/master/__init__.py b/python/paddle/v2/master/__init__.py index c8975b5d4a33cbecb4fa5a144bc610c36591d629..09daaaa75e01969ded25dcc848df3f7b9202124e 100644 --- a/python/paddle/v2/master/__init__.py +++ b/python/paddle/v2/master/__init__.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. from client import * __all__ = ['client'] diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index fc718f031e2267e737adbc340226e145bf614bf2..b874c2f349094c0c0ab9e3663fcc7491f1edd236 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import ctypes import os diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index bd97dc1199fedc8ac91c1c6086957e8cce88bdc4..7b7d1a1d1672802e0e91a857100604758683224e 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -383,19 +383,22 @@ class Parameters(object): params.deserialize(param_name, f) return params - def init_from_tar(self, f): + def init_from_tar(self, f, exclude_params=[]): """ Different from `from_tar`, this interface can be used to init partial network parameters from another saved model. :param f: the initialized model file. :type f: tar file + :param exclude_params: the names of parameters that should + not be initialized from the model file. + :type exclude_params: list of strings :return: Nothing. 
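+
+        A hypothetical usage sketch (the file name and parameter name
+        below are illustrative only, not part of the API)::
+
+            with open('pretrained_model.tar') as f:
+                parameters.init_from_tar(f, exclude_params=["embedding.w0"])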
""" tar_param = Parameters.from_tar(f) for pname in tar_param.names(): - if pname in self.names(): + if pname in self.names() and pname not in exclude_params: self.set(pname, tar_param.get(pname)) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 45a4288751e37b99dd1005ec78f30a98044926ff..44a6e344630bb35d28ee29078bf8727053a24bef 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,13 +14,16 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' ] +from threading import Thread +import subprocess + +from Queue import Queue import itertools import random -from Queue import Queue -from threading import Thread +import zlib def map_readers(func, *readers): @@ -323,3 +326,80 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): yield sample return xreader + + +def _buf2lines(buf, line_break="\n"): + # FIXME: line_break should be automatically configured. + lines = buf.split(line_break) + return lines[:-1], lines[-1] + + +class PipeReader: + """ + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. + + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: + + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" + + An example: + + .. code-block:: python + + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample + """ + + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ + remained = "" + while True: + buff = self.process.stdout.read(self.bufsize) + if buff: + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": + decomp_buff = buff + else: + raise TypeError("file_type %s is not allowed" % + self.file_type) + + if cut_lines: + lines, remained = _buf2lines(''.join( + [remained, decomp_buff]), line_break) + for line in lines: + yield line + else: + yield decomp_buff + else: + break diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2619c1c0e9db17c38ccc6e1dd010bd9c1c5966bd 100644 --- a/python/paddle/v2/reader/tests/__init__.py +++ b/python/paddle/v2/reader/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 5a92951b100fa51ab6df7039d9c6b54d1f9d963e..4ba71969dffe7447b6c5b70aeb752a4e5469fb36 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -145,5 +145,33 @@ class TestXmap(unittest.TestCase): self.assertEqual(e, mapper(idx)) +class TestPipeReader(unittest.TestCase): + def test_pipe_reader(self): + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l + + import tempfile + + records = [str(i) for i in xrange(5)] + temp = tempfile.NamedTemporaryFile() + try: + with open(temp.name, 'w') as f: + for r in records: + f.write('%s\n' % r) + + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) + finally: + # delete the temporary file + temp.close() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py index 7ba8a939fbd1a949d61a007b40c054e7543c0cbc..ab6863620feacd4db7f83eec976811ab0097f5e7 100644 --- a/python/paddle/v2/tests/test_parameters.py +++ b/python/paddle/v2/tests/test_parameters.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. import unittest import sys diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index db01ab7374eca18b6063dc634da5ef83c4bc9adc..1a70a7203b7e358ce654621d686319179b511249 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
""" Module Trainer """ diff --git a/python/setup.py.in b/python/setup.py.in index fe91df10daf303bb14d1e5f28817984d261e0880..65ec58ecf98e693ecf02922129f6eef13cbe5303 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,8 +1,66 @@ from setuptools import setup, Distribution, Extension +import subprocess class BinaryDistribution(Distribution): def has_ext_modules(foo): return True +MAJOR = 0 +MINOR = 11 +PATCH = 0 +RC = 0 +ISTAGED = False + + + +def git_commit(): + try: + cmd = ['git', 'rev-parse', 'HEAD'] + git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + except: + git_commit = 'Unknown' + return git_commit + +def write_version_py(filename='paddle/version.py'): + cnt = ''' +# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY +# +full_version = '%(major)d.%(minor)d.%(patch)d' +major = '%(major)d' +minor = '%(minor)d' +patch = '%(patch)d' +rc = '%(rc)d' +istaged = %(istaged)s +commit = '%(commit)s' +with_mkl = '%(with_mkl)s' + +def show(): + if istaged: + print 'full_version:', full_version + print 'major:', major + print 'minor:', minor + print 'patch:', patch + print 'rc:', rc + else: + print 'commit:', commit + +def mkl(): + return with_mkl +''' + commit = git_commit() + with open(filename, 'w') as f: + f.write(cnt % { + 'major': MAJOR, + 'minor': MINOR, + 'patch': PATCH, + 'rc': RC, + 'version': '${PADDLE_VERSION}', + 'commit': commit, + 'istaged': ISTAGED, + 'with_mkl': '@WITH_MKL@'}) + +write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') + + packages=['paddle', 'paddle.proto', 'paddle.trainer', @@ -15,18 +73,18 @@ packages=['paddle', 'paddle.v2.plot', 'paddle.v2.fluid', 'paddle.v2.fluid.proto', + 'paddle.v2.fluid.layers', 'py_paddle'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: setup_requires = f.read().splitlines() if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: - setup_requires+=["opencv-python"] + setup_requires+=['opencv-python'] # the prefix is sys.prefix which should always be usr paddle_bin_dir = 'opt/paddle/bin' -paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage', - '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', +paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] @@ -36,7 +94,7 @@ paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] if '${MKL_SHARED_LIBS}'!= '': paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';') -setup(name='paddlepaddle', +setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android new file mode 100644 index 0000000000000000000000000000000000000000..b6cae228a0c45ab70ba8ecc80ae4df7e0fa5bdbc --- /dev/null +++ b/tools/manylinux1/Dockerfile.android @@ -0,0 +1,55 @@ +FROM ubuntu:16.04 +MAINTAINER PaddlePaddle Authors + +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +# ENV variables +ARG ANDROID_ABI +ARG ANDROID_API + +ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} +ENV ANDROID_API=${ANDROID_API:-21} + +ENV HOME=/root \ + ANDROID_NDK_HOME=/opt/android-ndk-linux \ + ANDROID_TOOLCHAINS_DIR=/opt/toolchains + +RUN apt-get update && \ + apt-get install -y \ + git python-dev python-pip python-numpy 
\
+    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
+    apt-get clean -y
+
+# Install Go and glide
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# This must not be on the same line as the GOROOT definition; otherwise docker build cannot find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel sphinx && \
+    pip install pre-commit
+
+# Android NDK
+RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
+    mkdir -p /opt/android-ndk-tmp && \
+    cd /opt/android-ndk-tmp && \
+    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
+    unzip -q android-ndk-r14b-linux-x86_64.zip && \
+    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
+    rm -rf /opt/android-ndk-tmp
+
+CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
+
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
new file mode 100644
index 0000000000000000000000000000000000000000..2c6ba650a5d7996bef212e88a16f2a159ca377e7
--- /dev/null
+++ b/tools/manylinux1/Dockerfile.x64
@@ -0,0 +1,54 @@
+# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in
+# order to satisfy the build of capnproto library (a nupic.core dependency),
+# which requires some headers and symbols not present on CentOS-5 (e.g.,
+# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See
+# https://github.com/sandstorm-io/capnproto/issues/350.
+FROM nvidia/cuda:<baseimg>
+MAINTAINER Numenta, based on the ManyLinux project
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
+ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
+
+COPY build_scripts /build_scripts
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# for paddle
+RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+
+
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
+
+# protobuf 3.1.0
+RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \
+    tar xzf protobuf-cpp-3.1.0.tar.gz && \
+    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd ..
&& rm -f protobuf-cpp-3.1.0.tar.gz
+
+
+RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool
+
+RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
+    go get github.com/Masterminds/glide && \
+    rm -rf /root/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
+
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
+    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
+
+RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
+    make -j `nproc` install <NCCL_MAKE_OPTS> && cd .. && rm -rf nccl
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb0a9ac22cda6fb6f585ab8fd95179573c760f28
--- /dev/null
+++ b/tools/manylinux1/README.md
@@ -0,0 +1,30 @@
+# buildtools
+
+We release PaddlePaddle and PaddlePaddle Fluid as shared libraries,
+which we hope can be released as wheel packages on PyPI, so we need
+to make sure that the build follows the
+[manylinux1](https://www.python.org/dev/peps/pep-0513/) standard.
+
+The manylinux standard suggests building Python modules on an old
+system, because a module will inevitably depend on some shared system
+libraries, and binaries linked against newer versions of those
+libraries cannot run on systems that ship only older ones. The
+suggested build environment is as old as CentOS 5. However,
+PaddlePaddle relies on CUDA, and the earliest version of
+[CentOS that works with CUDA is 6](https://hub.docker.com/r/nvidia/cuda/).
+So, here we provide a Docker image based on CentOS 6 and CUDA for
+building PaddlePaddle, so that the release supports "as manylinux as
+possible" or "sufficiently many Linux" systems, following [this
+discussion](https://mail.python.org/pipermail/wheel-builders/2016-July/000175.html).
+
+The build output of our Docker image includes multiple wheel files --
+some contain the CPU-only binary, some others support CUDA; some are
+compatible with the cp27m Python ABI, some others with cp27mu.
+
+To build these wheels, please run the following commands:
+
+```bash
+git clone https://github.com/paddlepaddle/paddle
+cd paddle/tools/manylinux1
+REPO=[yourrepo] ./build_all.sh
+```
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
new file mode 100755
index 0000000000000000000000000000000000000000..097bedb5265d00f8aa362bb0272af633c97192ba
--- /dev/null
+++ b/tools/manylinux1/build_all.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -xe
+
+REPO="${REPO:-typhoon1986}"
+
+# NOTE: each CUDA base image below is paired with matching NVCC_GENCODE
+# flags; the version pairs are fixed, so do not mix them.
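+# An explanatory aside, assuming the placeholder tokens <baseimg> and
+# <NCCL_MAKE_OPTS> in Dockerfile.x64: each build below instantiates the
+# Dockerfile template twice with sed -- once to pick the nvidia/cuda base
+# image, and once to pick the NVCC_GENCODE flags that nccl is compiled with.
+# In each "-gencode=arch=compute_XX,code=sm_XX" entry, code=sm_XX emits
+# native SASS for one real GPU architecture, while code=compute_XX embeds
+# PTX that newer GPUs can JIT-compile.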
+sed 's/<baseimg>/7.5-cudnn5-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52"/g' > Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5
+
+sed 's/<baseimg>/8.0-cudnn5-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g' > Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5
+
+sed 's/<baseimg>/8.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g' > Dockerfile.tmp
+
+docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7
+
+sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g' > Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..93591fa9ddad8a78df344e1e912a5f1c7e93dfa4
--- /dev/null
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Top-level build script called from Dockerfile
+
+# Stop at any error, show all commands
+set -ex
+
+# Python versions to be installed in /opt/$VERSION_NO
+# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so
+# remove others to expedite build and reduce docker image size. The original
+# manylinux docker image project builds many python versions.
+# NOTE We added back 3.5.1, since auditwheel requires python 3.3+ +CPYTHON_VERSIONS="2.7.11 3.5.1" + +# openssl version to build, with expected sha256 hash of .tar.gz +# archive +OPENSSL_ROOT=openssl-1.0.2l +OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c +EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d +DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc +PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb +CURL_ROOT=curl-7.49.1 +CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1 +AUTOCONF_ROOT=autoconf-2.69 +AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 + +# Dependencies for compiling Python that we want to remove from +# the final image after compiling Python +PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel" + +# Libraries that are allowed as part of the manylinux1 profile +MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" + +# Get build utilities +MY_DIR=$(dirname "${BASH_SOURCE[0]}") +source $MY_DIR/build_utils.sh + +# EPEL support +yum -y install wget curl +curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm +check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH + +# Dev toolset (for LLVM and other projects requiring C++11 support) +curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo +check_sha256sum devtools-2.repo $DEVTOOLS_HASH +mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo +rpm -Uvh --replacepkgs epel-release-6*.rpm +rm -f epel-release-6*.rpm + +# Development tools and libraries +yum -y install bzip2 make git patch unzip bison yasm diffutils \ + automake which file \ + kernel-devel-`uname -r` \ + devtoolset-2-binutils devtoolset-2-gcc \ + devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \ + ${PYTHON_COMPILE_DEPS} + +# Install more recent version of cmake +# curl -O https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.sh +# /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license +# rm cmake-3.8.1-Linux-x86_64.sh + +wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \ +cd cmake-3.5.2 && ./bootstrap && \ +make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz + + +# Install newest autoconf +build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH +autoconf --version + +# Compile the latest Python releases. +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +build_openssl $OPENSSL_ROOT $OPENSSL_HASH +mkdir -p /opt/python +build_cpythons $CPYTHON_VERSIONS + +PY35_BIN=/opt/python/cp35-cp35m/bin +# NOTE Since our custom manylinux image builds pythons with shared +# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running +# python. 
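+# An illustration (assuming the layout created by build_utils.sh, where
+# /opt/python/cp35-cp35m is a symlink to /opt/_internal/cpython-3.5.1):
+# prepending that prefix's lib/ dir lets the dynamic linker resolve the
+# shared libpython without an /etc/ld.so.conf entry, e.g.
+#   LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/python/cp35-cp35m/lib" \
+#       /opt/python/cp35-cp35m/bin/python -c 'import ssl'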
+ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" + +# Our openssl doesn't know how to find the system CA trust store +# (https://github.com/pypa/manylinux/issues/53) +# And it's not clear how up-to-date that is anyway +# So let's just use the same one pip and everyone uses +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi +ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \ + /opt/_internal/certs.pem +# If you modify this line you also have to modify the versions in the +# Dockerfiles: +export SSL_CERT_FILE=/opt/_internal/certs.pem + +# Install newest curl +build_curl $CURL_ROOT $CURL_HASH +rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc +hash -r +curl --version +curl-config --features + +# Now we can delete our built SSL +rm -rf /usr/local/ssl + +# Install patchelf (latest with unreleased bug fixes) +curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz +check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH +tar -xzf patchelf-0.9njs2.tar.gz +(cd patchelf-0.9njs2 && ./configure && make && make install) +rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2 + +# Install latest pypi release of auditwheel +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel +ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel + +# Clean up development headers and other unnecessary stuff for +# final image +yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ + avahi freetype bitstream-vera-fonts \ + ${PYTHON_COMPILE_DEPS} > /dev/null 2>&1 +yum -y install ${MANYLINUX1_DEPS} +yum -y clean all > /dev/null 2>&1 +yum list installed +# we don't need libpython*.a, and they're many megabytes +find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f +# Strip what we can -- and ignore errors, because this just attempts to strip +# *everything*, including non-ELF files: +find /opt/_internal -type f -print0 \ + | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true +# We do not need the Python test suites, or indeed the precompiled .pyc and +# .pyo files. 
Partially cribbed from:
+# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
+find /opt/_internal \
+    \( -type d -a -name test -o -name tests \) \
+    -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
+    -print0 | xargs -0 rm -f
+
+for PYTHON in /opt/python/*/bin/python; do
+    # Add matching directory of libpython shared library to library lookup path
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib"
+
+    # Smoke test to make sure that our Pythons work, and do indeed detect as
+    # being manylinux compatible:
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
+    # Make sure that SSL cert checking works
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
+done
+
+# Restore LD_LIBRARY_PATH
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
new file mode 100755
index 0000000000000000000000000000000000000000..10422ae3bd00f4e0dd059af0384f8cc17e4b7855
--- /dev/null
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Helper utilities for build
+
+PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
+# XXX: the official https server at www.openssl.org cannot be reached
+# with the old versions of openssl and curl in Centos 5.11 hence the fallback
+# to the ftp mirror:
+# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source
+OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source
+# Ditto the curl sources
+CURL_DOWNLOAD_URL=http://curl.askapache.com/download
+
+GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
+
+AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf
+
+
+function check_var {
+    if [ -z "$1" ]; then
+        echo "required variable not defined"
+        exit 1
+    fi
+}
+
+
+function lex_pyver {
+    # Echoes Python version string padded with zeros
+    # Thus:
+    #     3.2.1 -> 003002001
+    #     3     -> 003000000
+    echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}'
+}
+
+
+function do_cpython_build {
+    local py_ver=$1
+    check_var $py_ver
+    local ucs_setting=$2
+    check_var $ucs_setting
+    tar -xzf Python-$py_ver.tgz
+    pushd Python-$py_ver
+    if [ "$ucs_setting" = "none" ]; then
+        unicode_flags=""
+        dir_suffix=""
+    else
+        local unicode_flags="--enable-unicode=$ucs_setting"
+        local dir_suffix="-$ucs_setting"
+    fi
+    local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}"
+    mkdir -p ${prefix}/lib
+    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
+
+    # NOTE --enable-shared for generating libpython shared library needed for
+    # linking of some of the nupic.core test executables.
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
+    make -j2 > /dev/null
+    make install > /dev/null
+    popd
+    echo "ZZZ looking for libpython"
+    find / -name 'libpython*.so*'
+    rm -rf Python-$py_ver
+    # Some pythons install as bin/python3. Make them available as
+    # bin/python.
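+    # For example (an illustration of the resulting layout):
+    #   /opt/_internal/cpython-3.5.1/bin/python3   <- real interpreter
+    #   /opt/_internal/cpython-3.5.1/bin/python    -> python3 (symlink added below)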
+ if [ -e ${prefix}/bin/python3 ]; then + ln -s python3 ${prefix}/bin/python + fi + # NOTE Make libpython shared library visible to python calls below + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel + local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) + ln -s ${prefix} /opt/python/${abi_tag} +} + + +function build_cpython { + local py_ver=$1 + check_var $py_ver + check_var $PYTHON_DOWNLOAD_URL + wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz + if [ $(lex_pyver $py_ver) -lt $(lex_pyver 3.3) ]; then + # NOTE We only need wide unicode for nupic.bindings wheel + do_cpython_build $py_ver ucs2 + do_cpython_build $py_ver ucs4 + else + do_cpython_build $py_ver none + fi + rm -f Python-$py_ver.tgz +} + + +function build_cpythons { + check_var $GET_PIP_URL + curl -sLO $GET_PIP_URL + for py_ver in $@; do + build_cpython $py_ver + done + rm get-pip.py +} + + +function do_openssl_build { + ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null + make > /dev/null + make install > /dev/null +} + + +function check_sha256sum { + local fname=$1 + check_var ${fname} + local sha256=$2 + check_var ${sha256} + + echo "${sha256} ${fname}" > ${fname}.sha256 + sha256sum -c ${fname}.sha256 + rm ${fname}.sha256 +} + + +function build_openssl { + local openssl_fname=$1 + check_var ${openssl_fname} + local openssl_sha256=$2 + check_var ${openssl_sha256} + check_var ${OPENSSL_DOWNLOAD_URL} + curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz + check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256} + tar -xzf ${openssl_fname}.tar.gz + (cd ${openssl_fname} && do_openssl_build) + rm -rf ${openssl_fname} ${openssl_fname}.tar.gz +} + + +function do_curl_build { + LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_curl { + local curl_fname=$1 + check_var ${curl_fname} + local curl_sha256=$2 + check_var ${curl_sha256} + check_var ${CURL_DOWNLOAD_URL} + curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2 + check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256} + tar -jxf ${curl_fname}.tar.bz2 + (cd ${curl_fname} && do_curl_build) + rm -rf ${curl_fname} ${curl_fname}.tar.bz2 +} + + +function do_standard_install { + ./configure > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_autoconf { + local autoconf_fname=$1 + check_var ${autoconf_fname} + local autoconf_sha256=$2 + check_var ${autoconf_sha256} + check_var ${AUTOCONF_DOWNLOAD_URL} + curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz + check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256} + tar -zxf ${autoconf_fname}.tar.gz + (cd ${autoconf_fname} && do_standard_install) + rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz +} diff --git a/tools/manylinux1/build_scripts/manylinux1-check.py b/tools/manylinux1/build_scripts/manylinux1-check.py new file mode 100644 index 0000000000000000000000000000000000000000..e4bde065a293c9d8ea8c5b150246766328138fd4 --- /dev/null +++ b/tools/manylinux1/build_scripts/manylinux1-check.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +# Logic copied from PEP 513 + + +def is_manylinux1_compatible(): + # Only Linux, and only x86-64 / i686 + from distutils.util import get_platform + if get_platform() not in ["linux-x86_64", "linux-i686"]: + return False + + # Check for presence of _manylinux module + try: + import _manylinux + return bool(_manylinux.manylinux1_compatible) + except (ImportError, AttributeError): + # Fall through to heuristic check below + pass + + # Check glibc version. CentOS 5 uses glibc 2.5. + return have_compatible_glibc(2, 5) + + +def have_compatible_glibc(major, minimum_minor): + import ctypes + + process_namespace = ctypes.CDLL(None) + try: + gnu_get_libc_version = process_namespace.gnu_get_libc_version + except AttributeError: + # Symbol doesn't exist -> therefore, we are not linked to + # glibc. + return False + + # Call gnu_get_libc_version, which returns a string like "2.5". + gnu_get_libc_version.restype = ctypes.c_char_p + version_str = gnu_get_libc_version() + # py2 / py3 compatibility: + if not isinstance(version_str, str): + version_str = version_str.decode("ascii") + + # Parse string and check against requested version. + version = [int(piece) for piece in version_str.split(".")] + assert len(version) == 2 + if major != version[0]: + return False + if minimum_minor > version[1]: + return False + return True + + +import sys +if is_manylinux1_compatible(): + print("%s is manylinux1 compatible" % (sys.executable, )) + sys.exit(0) +else: + print("%s is NOT manylinux1 compatible" % (sys.executable, )) + sys.exit(1) diff --git a/tools/manylinux1/build_scripts/python-tag-abi-tag.py b/tools/manylinux1/build_scripts/python-tag-abi-tag.py new file mode 100644 index 0000000000000000000000000000000000000000..301fbf07a47fef03c91d9dd5f49c2894a5971319 --- /dev/null +++ b/tools/manylinux1/build_scripts/python-tag-abi-tag.py @@ -0,0 +1,7 @@ +# Utility script to print the python tag + the abi tag for a Python +# See PEP 425 for exactly what these are, but an example would be: +# cp27-cp27mu + +from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag + +print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag())) diff --git a/tools/manylinux1/build_scripts/ssl-check.py b/tools/manylinux1/build_scripts/ssl-check.py new file mode 100644 index 0000000000000000000000000000000000000000..900185cef14c51bca6a929801a86728b7ffc0b4a --- /dev/null +++ b/tools/manylinux1/build_scripts/ssl-check.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +# cf. 
https://github.com/pypa/manylinux/issues/53
+
+GOOD_SSL = "https://google.com"
+BAD_SSL = "https://self-signed.badssl.com"
+
+import sys
+
+print("Testing SSL certificate checking for Python:", sys.version)
+
+# CPython verifies HTTPS certificates by default only since 2.7.9 and 3.4.3
+if (sys.version_info < (2, 7, 9) or
+        (sys.version_info[0] == 3 and sys.version_info < (3, 4, 3))):
+    print("This version never checks SSL certs; skipping tests")
+    sys.exit(0)
+
+if sys.version_info[0] >= 3:
+    from urllib.request import urlopen
+    EXC = OSError
+else:
+    from urllib import urlopen
+    EXC = IOError
+
+print("Connecting to %s should work" % (GOOD_SSL, ))
+urlopen(GOOD_SSL)
+print("...it did, yay.")
+
+print("Connecting to %s should fail" % (BAD_SSL, ))
+try:
+    urlopen(BAD_SSL)
+    # If we get here then we failed:
+    print("...it DIDN'T!!!!!11!!1one!")
+    sys.exit(1)
+except EXC:
+    print("...it did, yay.")
diff --git a/v1_api_demo/mnist/api_train.py b/v1_api_demo/mnist/api_train.py
index ea1caa7dd9653a2cc2860ace736fe3d25a3767e0..e42c6cbb7e0eed4f3a3625f18d79b3de64fd8e26 100644
--- a/v1_api_demo/mnist/api_train.py
+++ b/v1_api_demo/mnist/api_train.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # -*- coding: UTF-8 -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved diff --git a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py b/v1_api_demo/quick_start/trainer_config.bidi-lstm.py index ca1d1f8d099b5a3f5276c108855c5e890e7214fe..3deff4aa00b1ea5d66097514867d1a392393a523 100644 --- a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py +++ b/v1_api_demo/quick_start/trainer_config.bidi-lstm.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved diff --git a/v1_api_demo/quick_start/trainer_config.cnn.py b/v1_api_demo/quick_start/trainer_config.cnn.py index f8c3d511f323ed9ec96be0a1951014c6db639003..e09e41484d30db385a1d276b7f346b444fe79d3d 100644 --- a/v1_api_demo/quick_start/trainer_config.cnn.py +++ b/v1_api_demo/quick_start/trainer_config.cnn.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved diff --git a/v1_api_demo/quick_start/trainer_config.emb.py b/v1_api_demo/quick_start/trainer_config.emb.py index 7410397ef656e363b232787995d3a869cd11b655..f69f98ff7fc885d3fe16d3aaf66967389b3b3240 100644 --- a/v1_api_demo/quick_start/trainer_config.emb.py +++ b/v1_api_demo/quick_start/trainer_config.emb.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
# edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved diff --git a/v1_api_demo/quick_start/trainer_config.lr.py b/v1_api_demo/quick_start/trainer_config.lr.py index e5105aa89532d71c80c8ec77ca98ac6a8e9c8c58..b7b694940e338acbc40ffd3e5597f209bf07488f 100644 --- a/v1_api_demo/quick_start/trainer_config.lr.py +++ b/v1_api_demo/quick_start/trainer_config.lr.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved diff --git a/v1_api_demo/quick_start/trainer_config.lstm.py b/v1_api_demo/quick_start/trainer_config.lstm.py index 43b4ddac2dca5f6b9aa28f055e843abf12e92312..8967d78807b9bbf990f5dd36240c18199b86954e 100644 --- a/v1_api_demo/quick_start/trainer_config.lstm.py +++ b/v1_api_demo/quick_start/trainer_config.lstm.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved diff --git a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py b/v1_api_demo/quick_start/trainer_config.resnet-lstm.py index 89a837abb7cdeaaa249160123e1f2001d23d7aa1..32d0596f250c0f0c5a4004d3af7adb794b3f0f1b 100644 --- a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py +++ b/v1_api_demo/quick_start/trainer_config.resnet-lstm.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. # edit-mode: -*- python -*- # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved