Resolve conflicts.

10622ba3 · dangqingqing · 9d73950e · 7456d737 · 10622ba3 · 10622ba3
436 changed file
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
    examples:
      - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
      - Highly optimized recurrent networks which can handle **variable-length**
      sequence without padding.
      - Optimized local and distributed training for models with high dimensional

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
 # It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
@@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
  set(CBLAS_LIBRARIES ${MKLML_LIB})
-  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DPADDLE_WITH_MKLML)
  add_definitions(-DLAPACK_FOUND)
  message(STATUS "Found cblas and lapack in MKLML "
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  return()
 endif()
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
-        ${ATLAS_ROOT}/include
-        /usr/include
-        /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
-        ${ATLAS_ROOT}/lib
-        /usr/lib
-        /usr/lib/blas/atlas
-        /usr/lib/atlas
-        /usr/lib/atlas-base   # special for ubuntu 14.04.
-    )
-find_path(ATLAS_INC_DIR NAMES cblas.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-  add_definitions(-DPADDLE_USE_ATLAS)
-  add_definitions(-DLAPACK_FOUND)
-  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
-  return()
-endif()
 ## Then find openblas.
 set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_USE_MKLDNN)
+add_definitions(-DPADDLE_WITH_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
    模型配置 <v2/model_configs.rst>
    数据访问 <v2/data.rst>
    训练与应用 <v2/run_logic.rst>
+    v2/fluid.rst
--- a/doc/design/fluid-compiler.graffle
+++ b/doc/design/fluid-compiler.graffle
--- a/doc/design/fluid-compiler.png
+++ b/doc/design/fluid-compiler.png
--- a/doc/design/fluid.md
+++ b/doc/design/fluid.md
+# Design Doc: PaddlePaddle Fluid
+## Why Fluid
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describe the "process" of training or inference rather than relying on the concept of a model.  In fact, in PyTorch, TensorFlow Eager Execution, and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid currently takes this idea further than PyTorch and Eager Execution, and we are trying to push Fluid in the direction of a compiler and a new programming language for deep learning.
+## The Evolution of Deep Learning Systems
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+## Deep Learning Programming Paradigms
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+print W # print the trained model parameters.
+```
+The above program includes two parts:
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+The difficulty of debugging and quickly iterating on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as follows:
+```python
+W = tensor(...)
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+print W # print the trained model parameters.
+```
+We can see that the main difference is moving the model configuration part (the first step) into the training loop.  This change allows mistakes in the model configuration to be reported where they actually appear in the program.  This change also represents the model, or rather its forward pass, better by keeping the configuration process in the training loop.
+## Describe Arbitrary Models for the Future
+Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange(x.len()):
+        h[t] = the_step(x[t])
+```        
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+## Turing Completeness
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference is that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. It has not been researched in depth whether this is equivalent to the `if-then-else` in programming languages that makes them Turing complete.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
+## The Execution of a Fluid Program
+There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+Fluid is moving towards the direction of a compiler, which is explained in more detail later in this article.
+## Backward Compatibility of Fluid
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
+## Towards a Deep Learning Language and the Compiler
+We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
+Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation to an operator, into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`.  Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel.  More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server.  For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer.  The following figure explains the proposed two-stage process:
+![](fluid-compiler.png)
--- a/doc/design/images/multigpu_allreduce.graffle
+++ b/doc/design/images/multigpu_allreduce.graffle
--- a/doc/design/images/multigpu_allreduce.png
+++ b/doc/design/images/multigpu_allreduce.png
--- a/doc/design/images/multigpu_before_convert.graffle
+++ b/doc/design/images/multigpu_before_convert.graffle
--- a/doc/design/images/multigpu_before_convert.png
+++ b/doc/design/images/multigpu_before_convert.png
--- a/doc/design/mkldnn/image/engine.png
+++ b/doc/design/mkldnn/image/engine.png
--- a/doc/design/mkldnn/image/gradients.png
+++ b/doc/design/mkldnn/image/gradients.png
--- a/doc/design/mkldnn/image/layers.png
+++ b/doc/design/mkldnn/image/layers.png
--- a/doc/design/mkldnn/image/matrix.png
+++ b/doc/design/mkldnn/image/matrix.png
--- a/doc/design/mkldnn/image/overview.png
+++ b/doc/design/mkldnn/image/overview.png
--- a/doc/design/mkl/mkl_packed.md
+++ b/doc/design/mkl/mkl_packed.md
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+## Contents
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+	- [Layers](#layers)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+## Overview
+我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中，充分发挥英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+现阶段的优化主要针对 Recurrent Neural Network（以下简称RNN）相关层（包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`）， 以及 PaddlePaddle V1 API。
+## Key Points
+### Background
+目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数，这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
+1. 转换耗时 \
+这一数据格式的转换操作（Packing），在问题本身的计算量比较小的时候，显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中，矩阵大小是`batch_size * 2048`。
+2. 转换冗余 \
+由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。
+为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
+   * cblas_?gemm_alloc
+   * cblas_?gemm_pack 
+   * cblas_?gemm_compute
+   * cblas_?gemm_free
+通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。
+### Solution
+在RNN的情况下，同一次前向、后向（forward/backward）过程中所有时间步（time step）共享同一个权重（weight）。当只做推断（inference）时，各次前向之间也都使用了相同的权重，没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
+我们通过使用新引入的GEMM Packed APIs，在层初始化的时候，先完成对权重的Packing操作，然后在前向，后向时复用已经转换过的权重，并在每次权重更新后，对新的权重进行转换用于下次迭代。
+* 优化前，对于序列长度（sequence length）为`T`的网络模型（model）, `N`次迭代执行的转换次数为：
+  - `inference`： `N * T`  
+  - `training`： `2 * N * T`
+* 优化后，对于同样设置的网络模型，其转换次数减少至：
+  - `inference`： `1`    
+  - `training`： `2 * N`
+## Actions
+添加的相关文件和目录结构如下：
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        |   ├── MKLPackedGatedRecurrentLayer.*
+        |   ├── MKLPackedLstmLayer.*
+        |   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+### CMake
+在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开，来决定是否开启MKL Packed相关功能。
+### Layers
+所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`，该文件对相关GEMM Packed APIs做了封装。
+### Unit Tests
+我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
+对于每一个新加的RNN layer，我们会对比如下2个方面：
+1. 对比优化后layer自身，sequence mode（`rnn_use_batch=false`）与batch mode(`rnn_use_batch=true`)的结果。
+2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
+### Python API
+TBD
+### Benchmarking
+会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -208,4 +208,3 @@ if use_mkldnn
 但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
 4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
 所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
--- a/doc/design/paddle_nccl.md
+++ b/doc/design/paddle_nccl.md
+# Design Doc: NCCL support in Paddle Fluid
+## Abstract
+This design doc describes the NCCL feature in Paddle.  We propose an approach to support the NCCL library both on a single machine and on multiple machines. We wrap the NCCL primitives `Broadcast`, `Allreduce`, and `Reduce` as operators to utilize multi-GPU power in one script.
+## Motivation
+[NCCL](https://developer.nvidia.com/nccl) is an NVIDIA library that supports multi-GPU communication and is optimized for NVIDIA GPUs. It provides routines such as all-gather, all-reduce, broadcast, reduce, and reduce-scatter that can achieve high bandwidth over PCIe and NVLink high-speed interconnects. With the NCCL library, we can easily accelerate parallel training.
+- Pros
+1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library.
+1. high performance in NVIDIA GPUs.
+1. MPI like primitives, which have low learning cost for users.
+- Cons
+1. Only designed for NVIDIA GPUs; not a general multi-device solution.
+1. Although NCCL1 is open-sourced under the BSD license, NCCL2 is no longer open source.
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and to merge the gradients whenever the user requires.
+As a result, during training, we need operations for peer-to-peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only needs to run the operator with the correct place information.
+Besides, it needs interfaces to synchronize model updates across the different GPU cards.
+## Implementation
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Note that NCCL needs to create a Communicator between GPUs at the beginning, so an NCCLInit operator is created.
+### Transpiler
+To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices.
+1. The user-defined model will be a single device program
+2. Broadcast/Reduce operators between GPUs will be inserted into the program; for the multi-node case, `Send` and `Recv` operators may also be inserted.
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+   <img src="images/multigpu_before_convert.png" width="300"/>
+After compiling, the graph is as shown below:
+<img src="images/multigpu_allreduce.png" width="1000"/>
+Operators are added to the sub-graphs. Every GPU is assigned a role of `rank0`, `rank1`, etc.
+- **Broadcast**. The Broadcast operator distributes initialized parameters to all the GPUs from the GPU that owns them, e.g. from the `rank0` GPU.
+- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with the ring-based communication method, which avoids a bottleneck on a single GPU.
+Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process runs in asynchronous or synchronous mode depends on where the AllReduce point is in the graph.
+As shown in the picture, each GPU computes the gradient of `W`, followed by an `AllReduce` operator that accumulates `dW` over the full batch of data; each GPU then runs the optimization process individually and applies the gradient to its `W`.
+- **AllReduce**
+  Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive, every GPU would optimize the full batch of data, wasting (n-1) GPUs' compute resources. In addition, the NCCL2 built-in AllReduce only utilizes the communication resources during synchronization, and updating the gradient is a subsequent phase. In fact, we can amortize the gradient update time cost into the communication phase. The process is:
+1. Every parameter has its root card. That card is responsible for aggregating the gradients from the GPUs.
+2. The whole model's parameters are hashed to different root cards to ensure load balance between GPUs.
+3. Each card starts sending parameters to its logically neighboring card. After one round, the parameter's root card will have aggregated the full gradients.
+4. Then the root card optimizes the parameter.
+5. The root card sends its optimized result to its neighbor, and each neighbor then sends the parameter on to the next one.
+6. Finish the synchronization round.
+The total time cost will be 2 * (n-1) * per-parameter-send-time; we reach the goal of amortizing the update time into the communication phase.
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
+# Design Doc: Supporting new Device/Library
+## Background
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+So, how to support a new Device/Library in Fluid becomes a challenge.
+## Basic: Integrate A New Device/Library
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+There are mainly three parts that we have to consider while integrating a new device/library:
+- Place and DeviceContext: indicates the device id and manages hardware resources
+- Memory and Tensor: malloc/free data on certain device
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+### Place and DeviceContext
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+```
+        |   CPUPlace   --> MKLDNNPlace
+Place --|   CUDAPlace  --> CUDNNPlace
+        |   FPGAPlace
+```
+And `Place` is defined as follows:
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
+#### DeviceContext
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardware, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+```
+                /->  CPUDeviceContext   --> MKLDeviceContext
+DeviceContext ---->  CUDADeviceContext  --> CUDNNDeviceContext
+                \->  FPGADeviceContext
+```
+An example of Nvidia GPU is as follows:
+- DeviceContext
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+- CUDADeviceContext
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_; 
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+- CUDNNDeviceContext
+```
+class CUDNNDeviceContext : public CUDADeviceContext {
+  private:
+    cudnnHandle_t cudnn_handle_;
+};
+```
+### Memory and Tensor
+#### memory module
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+template <typename Place>
+void Free(Place place, void* ptr);
+template <typename Place>
+size_t Used(Place place);
+```
+To implement these interfaces, we have to implement a MemoryAllocator for each device.
+#### Tensor
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data<float>(place);
+```
+### Math Functor and OpKernel
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+The interface is defined in header file.
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+The CPU implementation is in the .cc file
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+  public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+The CUDA implementation is in the .cu file
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};                  
+```
+We get the computing handle from a concrete DeviceContext and perform computation on tensors.
+The implementation of `OpKernel` is similar to that of math functors; the extra thing we need to do is to register the OpKernel in a global map.
+Fluid provides different register interfaces in op_registry.h
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+In .cc file:
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+In .cu file:
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+## Advanced topics: How to switch between different Device/Library
+Generally, we will implement an OpKernel for every Device/Library of an Operator. We can easily train a Convolutional Neural Network on a GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
+We will discuss how to implement an efficient OpKernel switch policy. 
+- TBD
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -14,7 +14,7 @@
    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -114,7 +114,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
  .. code-block:: bash
-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 **注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
@@ -122,7 +122,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 **关于AVX：**

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -122,7 +122,7 @@ GPU driver installed before move on.
  .. code-block:: bash
-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 **NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
@@ -130,7 +130,7 @@ GPU driver installed before move on.
     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 **About AVX:**

--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
+import paddle.v2 as paddle
+import numpy as np
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+# load the model generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+# Input multiple sets of data; output the inference results as an array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -26,6 +26,11 @@ def event_handler(event):
        if event.batch_id % 1 == 0:
            print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
                                                  event.cost)
+    # save the model every 10 passes
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)
 # define training dataset reader

--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和
 ..  literalinclude:: src/train.py
    :linenos:
+使用以上训练好的模型进行预测，取其中一个模型params_pass_90.tar，输入需要预测的向量组，然后打印输出：
+..  literalinclude:: src/infer.py
+    :linenos:
 有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ## 构建和测试
-编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:dev`来代替。
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。
 如要build这个开发镜像，在源码目录树的根目录中运行：
 ```bash
-➜  docker build -t paddle:dev .
+➜  docker build -t paddle:latest-dev .
 ```
 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 ```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
 ```
 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
@@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 如果要运行所有的单元测试，可以用如下命令：
 ```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。

--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
 # 如何写新的Operator
 - [概念简介](#概念简介)
- - [实现C++类](#实现C++类)
+ - [实现C++类](#实现c类)
-   - [定义ProtoMaker类](#定义ProtoMaker类)
+   - [定义ProtoMaker类](#定义protomaker类)
-   - [定义Operator类](#定义Operator类)
+   - [定义Operator类](#定义operator类)
-   - [定义OpKernel类](#定义OpKernel类)
+   - [定义OpKernel类](#定义opkernel类)
-   - [注册Operator](#注册Operator)
+   - [注册Operator](#注册operator)
   - [编译](#编译)
- - [绑定Python](#绑定Python)
+ - [绑定Python](#绑定python)
 - [实现单元测试](#实现单元测试)
-   - [前向Operator单测](#前向Operator单测)
+   - [前向Operator单测](#前向operator单测)
-   - [反向Operator单测](#反向Operator单测)
+   - [反向Operator单测](#反向operator单测)
   - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
 ## 概念简介
@@ -30,8 +31,8 @@
 --------------  | :----------------------
 OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
 Op定义           | `.cc`文件
-Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，GPU 实现在`.cu`文件中。
+Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，GPU实现在`.cu`文件中
+注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
@@ -43,7 +44,7 @@ Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU
 ## 实现C++类
-### 1. 定义ProtoMaker类
+### 定义ProtoMaker类
 矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。
@@ -100,7 +101,7 @@ The equation is: Out = scale*X
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
-### 2. 定义Operator类
+### 定义Operator类
 下面的点实现了MulOp的定义：
@@ -149,11 +150,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中
-### 3. 定义OpKernel类
+### 定义OpKernel类
 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
- `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
 - `typename T` : 表示数据类型，如`float`, `double`等。
@@ -165,7 +166,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 下面是 `MulKernel` `Compute`的实现：
  ```cpp
-  template <typename Place, typename T>
+  template <typename DeviceContext, typename T>
  class MulKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -173,33 +174,32 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    auto* Y = context.Input<Tensor>("Y");
    auto* Z = context.Output<Tensor>("Out");
    Z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
+    auto& device_context = context.template device_context<DeviceContext>();
-        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
  }
  };
  ```
-需要注意：**不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
+需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
-`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
-为了使`OpKernel`的计算过程书写更加简单，并且CPU、GPU的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
-### 4. 注册Operator
+### 注册Operator
 - 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
    ```cpp
    namespace ops = paddle::operators;
    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
-                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
    ```
   在上面的代码中：
@@ -209,20 +209,20 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
- 在 `.cu`文件中注册GPU Kernel。
+- 在 `.cu`文件中注册CUDA Kernel。
-    - 请注意，如果GPU Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
+    - 请注意，如果CUDA Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
    ```cpp
    // if use Eigen unsupported module before include head files
-    // #define EIGEN_USE_GPU
+    #define EIGEN_USE_GPU
    namespace ops = paddle::operators;
-    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-    REGISTER_OP_GPU_KERNEL(mul_grad,
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
-                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
    ```
-### 5. 编译
+### 编译
 运行下面命令可以进行编译：
@@ -236,71 +236,57 @@ make mul_op
 ## 实现单元测试
-单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
-### 前向Operator单元测试
+### 前向Operator单测
-前向Op单元测试继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator，需要：
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator，需要：
 1. 在`setUp`函数定义输入、输出，以及相关的属性参数。
 2. 生成随机的输入数据。
 3. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架，直接调用相应接口即可。
  ```python
  import unittest
  import numpy as np
-  from gradient_checker import GradientChecker, create_op
+  from op_test import OpTest
-  from op_test_util import OpTestMeta
-  class TestMulOp(unittest.TestCase):
-      __metaclass__ = OpTestMeta
+  class TestMulOp(OpTest):
      def setUp(self):
-          self.type = "mul"
+          self.op_type = "mul"
          self.inputs = {
              'X': np.random.random((32, 84)).astype("float32"),
              'Y': np.random.random((84, 100)).astype("float32")
          }
          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-  ```
-上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
- `self.type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
+      def test_check_output(self):
+          self.check_output()
-### 反向Operator单元测试
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-反向Op单元测试继承自`GradientChecker`，而`GradientChecker`继承自`unittest.TestCase`，因此，**反向单元测试函数需要以`test_`开头**。
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-```python
+      def test_check_grad_ingore_y(self):
-class TestMulGradOp(GradientChecker):
+          self.check_grad(
-    def setUp(self):
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-        self.op = create_op("mul")
+  ```
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-    def test_check_grad_normal(self):
-        # mul op will enlarge the relative error
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-    def test_check_grad_ingore_x(self):
+上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-    def test_check_grad_ingore_y(self):
+- `self.op_type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
-        self.check_grad(
+- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
-```
-下面解释代码中一些关键的地方:
+### 反向Operator单测
- 调用`create_op("mul")`创建反向Op对应的前向Op。
+而反向测试中：
 - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
@@ -308,7 +294,7 @@ class TestMulGradOp(GradientChecker):
 - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
-### 编译和执行单元测试
+### 编译和执行
 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
@@ -328,5 +314,5 @@ ctest -R test_mul_op
 - 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
 - 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
- 如果Op没有实现GPU Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
 # How to write a new operator
 - [Background](#background)
- - [Implementing C++ Types](#implementing-c++-types)
+ - [Implementing C++ Types](#implementing-c-types)
-   - [Defining ProtoMaker](#defining-protoMaker)
+   - [Defining ProtoMaker](#defining-protomaker)
   - [Defining Operator](#defining-operator)
   - [Registering Operator](#registering-operator)
   - [Compilation](#compilation)
@@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator
 --------------  | :----------------------
 OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
 Op definition           | `.cc` files
-Kernel implementation       | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
+Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
-Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
@@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd
 ## Implementing C++ Types
-### 1. Defining Class ProtoMaker
+### Defining ProtoMaker
 Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.
@@ -98,7 +98,7 @@ There are two changes in this example:
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
-### 2. Defining Operator
+### Defining Operator
 The following code defines the interface for MulOp:
@@ -147,11 +147,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
-### 3. Defining OpKernel
+### Defining OpKernel
 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:
- `typename  Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename DeviceContext` denotes the device context type. When different device contexts, namely `CPUDeviceContext` and `CUDADeviceContext`, share the same kernel, this template parameter needs to be added. If they don't share kernels, it must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
 - `typename T` denotes data type, such as `float` or `double`.
@@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 `MulKernel`'s implementation of `Compute` is as follows:
  ```cpp
-  template <typename Place, typename T>
+  template <typename DeviceContext, typename T>
  class MulKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
    auto* Y = context.Input<Tensor>("Y");
    auto* Z = context.Output<Tensor>("Out");
    Z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
+    auto& device_context = context.template device_context<DeviceContext>();
-        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
  }
  };
  ```
-Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA) share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
-`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
 To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
@@ -189,16 +188,16 @@ This concludes the forward implementation of an operator. Next its operation and
 The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
-### 4. Registering Operator
+### Registering Operator
 - In `.cc` files, register forward and backward operator classes and the CPU kernel.
    ```cpp
    namespace ops = paddle::operators;
    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
-                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
    ```
   In that code block,
@@ -208,20 +207,20 @@ The definition of its corresponding backward operator, if applicable, is similar
    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
- Registering GPU Kernel in `.cu` files
+- Registering CUDA Kernel in `.cu` files
-    - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+    - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
    ```cpp
    // if use Eigen unsupported module before include head files
    #define EIGEN_USE_GPU
    namespace ops = paddle::operators;
-    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-    REGISTER_OP_GPU_KERNEL(mul_grad,
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
-                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
    ```
-### 5. Compilation
+### Compilation
 Run the following commands to compile.
@@ -253,62 +252,51 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
 2. Generating random input data.
-3. Implementing the same computation logic in a Python script:
+3. Implementing the same computation logic in a Python script.
+4. Calling the gradient check functions to verify the backward operator.
  ```python
  import unittest
  import numpy as np
-  from gradient_checker import GradientChecker, create_op
+  from op_test import OpTest
-  from op_test_util import OpTestMeta
-  class TestMulOp(unittest.TestCase):
-      __metaclass__ = OpTestMeta
+  class TestMulOp(OpTest):
      def setUp(self):
-          self.type = "mul"
+          self.op_type = "mul"
          self.inputs = {
              'X': np.random.random((32, 84)).astype("float32"),
              'Y': np.random.random((84, 100)).astype("float32")
          }
          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+      def test_check_output(self):
+          self.check_output()
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
  ```
 Get its output, and compare it with the forward operator's own output.
 The code above first loads required packages. In addition, we have
- `self.type = "mul" ` defines the type that is identical to what the operator's registered type.
+- `self.op_type = "mul" ` defines the operator type, which must be identical to the type the operator was registered with.
 - `self.inputs` defines input, with type `numpy.array` and initializes it.
 - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
 ### Testing Backward Operators
-A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**.
+Some key points about the gradient checks above include:
-```python
-class TestMulGradOp(GradientChecker):
-    def setUp(self):
-        self.op = create_op("mul")
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-    def test_check_grad_normal(self):
-        # mul op will enlarge the relative error
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
-Some key points in the code above include:
- `create_op("mul")` creates the backward operator's corresponding forward operator.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
  - The second variable `"Out"` points to the network's final output target `Out`.
@@ -338,5 +326,5 @@ ctest -R test_mul_op
 - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
 - The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
+# PaddlePaddle Fluid Source Code Overview
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+# Compile Time
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
+  - Every Layer has one or more operators and variables/parameters
+    - All the operators are defined in [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other files worth looking at:
+      - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+      - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
+      - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
+# Run Time
+The following **evaluates** the NN. It instantiates all the variables and operators.
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+  - The device handles are defined in [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+    - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+      - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -18,11 +18,11 @@ PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/
 - `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后，PaddlePaddle的CMake系统会自动编译所有的第三方依赖库，并且强制设置一些PaddlePaddle参数的值（`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
 - `WITH_C_API`，是否编译C-API预测库，必须设置为ON。在iOS平台上只支持使用C-API来预测。
- `WITH_SWIG_PY`，必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。
 iOS平台可选配置参数：
- `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
+- `IOS_PLATFORM`，可设置为`OS`（默认值）或`SIMULATOR`。
  - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：

--- a/doc/mobile/cross_compiling_for_ios_en.md
+++ b/doc/mobile/cross_compiling_for_ios_en.md
+# PaddlePaddle Compiling Guide for iOS
+This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
+## Preparation
+Apple provides Xcode, which serves as both the cross-compiling toolchain and the IDE for iOS development. Download it from the App Store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run the following command:
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+## Cross-compiling configurations
+PaddlePaddle provides a cross-compiling toolchain configuration file, [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+The following CMake variables must be set before cross-compiling PaddlePaddle for iOS:
+- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`.
+- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. Training or inference through swig is not supported on iOS.
+Optional CMake variables for iOS are:
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, build targets ARM-based physical devices like iPhone or iPad.
+  - `SIMULATOR`, build targets x86 architecture simulators.
+- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below:
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+- `IOS_DEPLOYMENT_TARGET`, the minimum iOS version to deploy to, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to `SDK` root, can be explicitly set with your `/path/to/platform/Developer/SDKs/SDK`. If left blank, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`.
+Other settings:
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist.
+Some typical CMake configurations:
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+You can set other compiling parameters according to your needs. For example, if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; or if performance is your concern, set `CMAKE_BUILD_TYPE` to `Release`. You can even tune the PaddlePaddle compiling procedure by manually setting the `CMAKE_C/CXX_FLAGS` values.
+**TIPS for a better performance**:
+- set `CMAKE_BUILD_TYPE` with `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
+## Compile and install
+After running CMake, run the following commands. PaddlePaddle will download and compile the 3rd party dependencies, then compile and install the PaddlePaddle inference library.
+```
+$ make
+$ make install
+```
+Please note: if you have compiled PaddlePaddle in the source directory for other platforms, first remove the `third_party` and `build` directories within the source tree with `rm -rf`, to ensure that all the 3rd party dependencies and PaddlePaddle itself are freshly compiled with the current CMake configuration.
+`your/path/to/install` directory will have following directories after `compile` and `install`:
+- `include`, contains all the C-API header files.
+- `lib`, contains PaddlePaddle C-API static library.
+- `third_party` contains all the 3rd party libraries.
+Please note: if the PaddlePaddle library needs to support both physical devices and simulators, you will need to compile it for each platform separately, then merge the results into a fat library with `lipo`.
+Now you have the PaddlePaddle library compiled and installed; the fat library can be used in deep learning related iOS apps. Please refer to the C-API documentation for usage guides.
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
@@ -5,4 +5,5 @@ MOBILE
  :maxdepth: 1
  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
  cross_compiling_for_raspberry_en.md
--- a/paddle/capi/error.cpp
+++ b/paddle/capi/error.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "error.h"
-const char* paddle_error_string(paddle_error err) {
+extern "C" const char* paddle_error_string(paddle_error err) {
  switch (err) {
    case kPD_NULLPTR:
      return "nullptr error";

--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -29,9 +29,17 @@ typedef enum {
  kPD_UNDEFINED_ERROR = -1,
 } paddle_error;
+#ifdef __cplusplus
+extern "C" {
+#endif
 /**
 * Error string for Paddle API.
 */
 PD_API const char* paddle_error_string(paddle_error err);
+#ifdef __cplusplus
+}
+#endif
 #endif
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -58,3 +58,6 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
        proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_test(init_test SRCS init_test.cc DEPS init)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -430,14 +430,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
    std::vector<std::unique_ptr<OpDescBind>> op_grads;
    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
-      int step_block_idx = (*it)->GetBlockAttr("step_block");
+      int step_block_idx = (*it)->GetBlockAttr("sub_block");
      BlockDescBind* backward_block = CreateStepBlock(
          program_desc, no_grad_vars, grad_to_var, step_block_idx);
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
    } else if ((*it)->Type() == "conditional_block") {
      BlockDescBind* backward_block =
          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
-                          (*it)->GetBlockAttr("block"));
+                          (*it)->GetBlockAttr("sub_block"));
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
    } else {
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #include <sstream>
 #include <vector>

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,32 +33,12 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
-Executor::Executor(const std::vector<platform::Place>& places) : own_(true) {
+DeviceContextPool* DeviceContextPool::pool = nullptr;
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  device_contexts_.resize(places.size());
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i]));
-    } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-      device_contexts_[i] = new platform::CUDADeviceContext(
-          boost::get<platform::GPUPlace>(places[i]));
-#else
-      PADDLE_THROW(
-          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
-#endif
-    }
-  }
-}
-Executor::~Executor() {
+Executor::Executor(const std::vector<platform::Place>& places) {
-  if (own_) {
+  DeviceContextPool& pool = DeviceContextPool::Get();
-    for (auto& device_context : device_contexts_) {
+  auto borrowed_contexts = pool.Borrow(places);
-      delete device_context;
+  device_contexts_.swap(borrowed_contexts);
-    }
-  }
 }
 static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
@@ -132,8 +112,5 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  }
 }
-Executor::Executor(const platform::DeviceContext& device)
-    : device_contexts_({&device}), own_(false) {}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -14,19 +14,98 @@ limitations under the License. */
 #pragma once
+#include <map>
+#include <unordered_map>
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace framework {
+// Registry that owns the DeviceContext objects created for a set of Places.
+//
+// A single instance is reachable through the static Create()/Get() pair;
+// Executors obtain non-owning pointers to its contexts through Borrow().
+class DeviceContextPool {
+ public:
+  // Returns the singleton pool. Create() must have been called before,
+  // otherwise the enforce below fails.
+  static DeviceContextPool& Get() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+  // Builds the singleton from `places` on the first call. Later calls ignore
+  // `places` and return the pool that already exists. No locking is done
+  // here, so concurrent first calls are not protected.
+  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+  // Returns one context per entry of `places`, in the same order. The
+  // returned pointers stay owned by the pool. Throws when a requested place
+  // has no context registered in this pool.
+  std::vector<const platform::DeviceContext*> Borrow(
+      const std::vector<platform::Place>& places) {
+    PADDLE_ENFORCE_GT(places.size(), 0);
+    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
+    std::vector<const platform::DeviceContext*> borrowed_contexts;
+    for (auto& place : places) {
+      auto range = device_contexts_.equal_range(place);
+      if (range.first == range.second) {
+        PADDLE_THROW(
+            "'Place' is not supported, Please re-compile with WITH_GPU "
+            "option");
+      }
+      // TODO(dzhwinter) : assign the first found device. Will be enhanced
+      // later. A device load balancer may be useful here.
+      borrowed_contexts.emplace_back(range.first->second);
+    }
+    return borrowed_contexts;
+  }
+  // Creates one CPUDeviceContext / CUDADeviceContext per entry of `places`.
+  // A GPU place throws when the build lacks PADDLE_WITH_CUDA. A place that is
+  // neither CPU nor GPU is silently skipped.
+  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
+    PADDLE_ENFORCE_GT(places.size(), 0);
+    for (size_t i = 0; i < places.size(); i++) {
+      if (platform::is_cpu_place(places[i])) {
+        device_contexts_.emplace(
+            places[i], new platform::CPUDeviceContext(
+                           boost::get<platform::CPUPlace>(places[i])));
+      } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+        device_contexts_.emplace(
+            places[i], new platform::CUDADeviceContext(
+                           boost::get<platform::GPUPlace>(places[i])));
+#else
+        PADDLE_THROW(
+            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
+            "option");
+#endif
+      }
+    }
+  }
+  // Releases every context allocated by the constructor; the pool is their
+  // only owner. Pointers handed out by Borrow() must not be used afterwards.
+  // NOTE(review): deleting through the base pointer assumes that
+  // platform::DeviceContext has a virtual destructor - confirm.
+  ~DeviceContextPool() {
+    for (auto& entry : device_contexts_) {
+      delete entry.second;
+    }
+  }
+ private:
+  static DeviceContextPool* pool;
+  // Hashes a Place by the index of its active variant alternative only, so
+  // all places of the same kind share one bucket; the map's key equality
+  // tells them apart.
+  struct Hash {
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      return hash_(place.which());
+    }
+  };
+  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
+                          Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
 class Executor {
 public:
+  // TODO(dzhwinter) : Do not rely on this function, it will be removed
+  explicit Executor(const platform::DeviceContext& device)
+      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
+  explicit Executor(const platform::Place& place)
+      : Executor(std::vector<platform::Place>({place})) {}
  explicit Executor(const std::vector<platform::Place>& places);
-  explicit Executor(const platform::DeviceContext& devices);
-  ~Executor();
  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
@@ -39,7 +118,6 @@ class Executor {
 private:
  std::vector<const platform::DeviceContext*> device_contexts_;
-  bool own_;
 };
 }  // namespace framework

--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include <string>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/init.h"
+#include "paddle/platform/place.h"
+#include "paddle/string/piece.h"
+namespace paddle {
+namespace framework {
+std::once_flag gflags_init_flag;
+// TODO(qijun) move init gflags to init.cc
+// Parses `argv` as gflags command-line flags. Only the first call in the
+// process does any work (guarded by gflags_init_flag); later calls return
+// without touching gflags.
+//
+// argv: the argument strings. NOTE(review): gflags presumably expects the
+//       program name at index 0, as in main() - confirm against callers.
+void InitGflags(std::vector<std::string> &argv) {
+  std::call_once(gflags_init_flag, [&]() {
+    int argc = static_cast<int>(argv.size());
+    // gflags takes a mutable char** and, with remove_flags == true, may
+    // re-point it. Keep the pointer table in a vector so the storage is
+    // released on scope exit regardless of what happens to `arr`.
+    std::vector<char *> arg_ptrs;
+    arg_ptrs.reserve(argv.size() + 1);
+    std::string line;
+    for (auto &arg : argv) {
+      arg_ptrs.push_back(&arg[0]);
+      line += arg;
+      line += ' ';
+    }
+    // Terminate the table like a real argv (argv[argc] == nullptr); this also
+    // keeps data() non-null when `argv` is empty.
+    arg_ptrs.push_back(nullptr);
+    char **arr = arg_ptrs.data();
+    google::ParseCommandLineFlags(&argc, &arr, true);
+    VLOG(1) << "Init commandline: " << line;
+  });
+}
+// Translates device descriptor strings into Places and creates the global
+// DeviceContextPool from them.
+//
+// Descriptor handling, as implemented below:
+//   - a string without ':' (e.g. "CPU") maps to CPUPlace;
+//   - a string starting with "GPU" and containing ':' (e.g. "GPU:1") maps to
+//     GPUPlace with the id taken from the text after the last ':'. In builds
+//     without PADDLE_WITH_CUDA the entry is skipped with a warning;
+//   - any other string containing ':' is rejected.
+// A CPUPlace is appended when the list does not already contain one.
+//
+// Returns false, without creating the pool, on a rejected descriptor;
+// returns true after DeviceContextPool::Create has been called.
+bool InitDevices(const std::vector<std::string> &devices) {
+  // TODO(dzhwinter) : add device format annotation for users.
+  std::vector<platform::Place> places;
+  for (auto &device : devices) {
+    auto p = string::Piece(device);
+    if (string::Find(p, ':', 0) == string::Piece::npos) {
+      places.emplace_back(platform::CPUPlace());
+    } else if (string::HasPrefix(p, "GPU")) {
+#ifdef PADDLE_WITH_CUDA
+      auto pos = string::RFind(p, ':', string::Piece::npos);
+      auto number = device.substr(pos + 1);
+      places.emplace_back(platform::GPUPlace(std::stoi(number)));
+#else
+      LOG(WARNING)
+          << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+#endif
+    } else {
+      return false;
+    }
+  }
+  // Make sure a CPU place is always part of the pool.
+  const bool has_cpu =
+      std::find_if(places.begin(), places.end(),
+                   [](const platform::Place &place) {
+                     return platform::is_cpu_place(place);
+                   }) != places.end();
+  if (!has_cpu) {
+    places.emplace_back(platform::CPUPlace());
+    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+  }
+  DeviceContextPool::Create(places);
+  return true;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/init.h
+++ b/paddle/framework/init.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <mutex>
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+namespace paddle {
+namespace framework {
+// Parses the given arguments as gflags command-line flags. The definition in
+// init.cc runs the parsing at most once per process.
+void InitGflags(std::vector<std::string> &argv);
+// Creates the global DeviceContextPool from device descriptors such as "CPU"
+// or "GPU:1". Returns false when a descriptor is not recognized.
+bool InitDevices(const std::vector<std::string> &devices);
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "gtest/gtest.h"
+#include "paddle/framework/init.h"
+// InitDevices must accept a CPU-only device list and, in CUDA builds, a
+// mixed CPU/GPU list.
+TEST(Init, InitDevices) {
+  using paddle::framework::InitDevices;
+  const std::vector<std::string> cpu_only{"CPU"};
+  ASSERT_TRUE(InitDevices(cpu_only));
+#ifdef PADDLE_WITH_CUDA
+  const std::vector<std::string> cpu_and_gpus{"CPU", "GPU:0", "GPU:1"};
+  ASSERT_TRUE(InitDevices(cpu_and_gpus));
+#endif
+}
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
    return 0;                                                             \
  }
-#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
+#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
  REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
 #else
 #define USE_OP_KERNEL(op_type)        \
  USE_OP_DEVICE_KERNEL(op_type, CPU); \
-  USE_OP_DEVICE_KERNEL(op_type, GPU)
+  USE_OP_DEVICE_KERNEL(op_type, CUDA)
 #endif
 #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
  USE_OP_ITSELF(op_type);        \
  USE_OP_DEVICE_KERNEL(op_type, CPU);
-#define USE_GPU_ONLY_OP(op_type) \
+#define USE_CUDA_ONLY_OP(op_type) \
-  USE_OP_ITSELF(op_type);        \
+  USE_OP_ITSELF(op_type);         \
-  USE_OP_DEVICE_KERNEL(op_type, GPU)
+  USE_OP_DEVICE_KERNEL(op_type, CUDA)
 #define USE_OP(op_type)   \
  USE_OP_ITSELF(op_type); \

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,20 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-template <>
-Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
-    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.GetEigenDevice<platform::CPUPlace>();
-}
-#ifdef PADDLE_WITH_CUDA
-template <>
-Eigen::GpuDevice&
-ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.GetEigenDevice<platform::GPUPlace>();
-}
-#endif
 std::string OperatorBase::Input(const std::string& name) const {
  auto& ins = Inputs(name);
  PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope,
 }
 OpKernelType OperatorWithKernel::GetKernelType(
    const ExecutionContext& ctx) const {
-  return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
 }
 DataType OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -276,17 +276,25 @@ class ExecutionContext {
    out_tensor->set_lod(in_tensor.lod());
  }
-  template <typename PlaceType,
-            typename DeviceType = typename platform::EigenDeviceConverter<
-                PlaceType>::EigenDeviceType>
-  DeviceType& GetEigenDevice() const;
  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+  template <typename DeviceContextType>
+  const DeviceContextType& device_context() const {
+    return *reinterpret_cast<const DeviceContextType*>(&device_context_);
+  }
  const platform::DeviceContext& device_context() const {
    return device_context_;
  }
+#ifdef PADDLE_WITH_CUDA
+  const inline platform::CUDADeviceContext& cuda_device_context() const {
+    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+    return *reinterpret_cast<const platform::CUDADeviceContext*>(
+        &device_context_);
+  }
+#endif
  //! Get actual name vector for this input.
  const std::vector<std::string>& Inputs(const std::string& name) const {
    return op_.Inputs(name);
@@ -297,14 +305,6 @@ class ExecutionContext {
    return op_.Outputs(name);
  }
-#ifdef PADDLE_WITH_CUDA
-  const inline platform::CUDADeviceContext& cuda_device_context() const {
-    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-    return *reinterpret_cast<const platform::CUDADeviceContext*>(
-        &device_context_);
-  }
-#endif
 private:
  const OperatorBase& op_;
  const Scope& scope_;

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}
  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-    return OpKernelType(DataType::FP32, ctx.device_context());
+    return OpKernelType(DataType::FP32, ctx.GetPlace());
  }
 };

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,6 +126,11 @@ public:
      inputData += inputChannels * inputHeight * inputWidth;
      outputData += outputChannels * outputHeight * outputWidth;
    }
+#ifdef PADDLE_MOBILE_INFERENCE
+    if (Device == DEVICE_TYPE_CPU) {
+      delete memory_;
+    }
+#endif
  }
 };

--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Logging.h"
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 #include "MKLDNNActivation.h"
 #endif
@@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) {
 END_DEFINE_ACTIVATION(log)
 ActivationFunction* ActivationFunction::create(const std::string& type) {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
    return MKLDNNActivation::create(type);
  }

--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 #include "paddle/gserver/layers/MKLDNNLayer.h"
 #endif
@@ -307,7 +307,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
 }
 void NeuralNetwork::finish() {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
  FOR_EACH_R(layer, layers_) {
    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
    if (dnnLayer) {

--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -84,12 +84,15 @@ void ROIPoolLayer::forward(PassType passType) {
  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
  real* outputData = outputValue->getData();
-  Matrix::resizeOrCreate(maxIdxs_,
+  real* argmaxData = nullptr;
-                         numROIs,
+  if (passType != PASS_TEST) {
-                         channels_ * pooledHeight_ * pooledWidth_,
+    Matrix::resizeOrCreate(maxIdxs_,
-                         false,
+                           numROIs,
-                         false);
+                           channels_ * pooledHeight_ * pooledWidth_,
-  real* argmaxData = maxIdxs_->getData();
+                           false,
+                           false);
+    argmaxData = maxIdxs_->getData();
+  }
  for (size_t n = 0; n < numROIs; ++n) {
    // the first five elememts of each RoI should be:
@@ -128,14 +131,18 @@ void ROIPoolLayer::forward(PassType passType) {
          bool isEmpty = (hend <= hstart) || (wend <= wstart);
          size_t poolIndex = ph * pooledWidth_ + pw;
          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
-          argmaxData[poolIndex] = -1;
+          if (argmaxData) {
+            argmaxData[poolIndex] = -1;
+          }
          for (size_t h = hstart; h < hend; ++h) {
            for (size_t w = wstart; w < wend; ++w) {
              size_t index = h * width_ + w;
              if (batchData[index] > outputData[poolIndex]) {
                outputData[poolIndex] = batchData[index];
-                argmaxData[poolIndex] = index;
+                if (argmaxData) {
+                  argmaxData[poolIndex] = index;
+                }
              }
            }
          }
@@ -143,7 +150,9 @@ void ROIPoolLayer::forward(PassType passType) {
      }
      batchData += channelOffset;
      outputData += poolChannelOffset;
-      argmaxData += poolChannelOffset;
+      if (argmaxData) {
+        argmaxData += poolChannelOffset;
+      }
    }
    bottomROIs += roiOffset;
  }

--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
    hl_sequence2batch_copy(
        batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
  } else {
-    for (int i = 0; i < batchCount; ++i) {
+    if (seq2batch) {
-      if (seq2batch) {
+#ifdef PADDLE_WITH_MKLML
+      // Copy each row in 8 KB blocks so that the row loop and the block loop
+      // can be collapsed and parallelized together with OpenMP.
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
+#pragma omp parallel for collapse(2)
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
+                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
+                                            : blockMemSize);
+        }
+      }
+#else
+      for (int i = 0; i < batchCount; ++i) {
        memcpy(batch.rowBuf(i),
               sequence.rowBuf(idxData[i]),
               seqWidth * sizeof(real));
-      } else {
+      }
+#endif
+    } else {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < batchCount; ++i) {
        memcpy(sequence.rowBuf(idxData[i]),
               batch.rowBuf(i),
               seqWidth * sizeof(real));

--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,7 @@ public:
   */
  virtual void* alloc(size_t size) {
    void* ptr;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
    // memory alignment
    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
 }
 #endif
-#if defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_WITH_MKLML)
 template <>
 void vExp<float>(const int n, const float* a, float* r) {

--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
+#include "NEONFunctions.h"
 #include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
@@ -4165,16 +4166,36 @@ void CpuMatrix::print(std::ostream& os) const {
 void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
  real* input = data.getData();
  real* w = W.getData();
+  real* output = data_;
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
  size_t paraSize = W.getHeight() * W.getWidth();
  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
  size_t partial_sum = numElements / paraSize;
+  if (paraSize == numElements) {
+    for (size_t n = 0; n < numSamples * numElements; ++n) {
+      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
+    }
+    return;
+  }
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  for (size_t n = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < paraSize; i++) {
+      neon::prelu(
+          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
+    }
+    input = input + numElements;
+    output = output + numElements;
+  }
+#else
  for (size_t n = 0, k = 0; n < numSamples; ++n) {
    for (size_t i = 0; i < numElements; ++i, ++k) {
-      data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
+      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
    }
  }
+#endif
 }
 void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {

--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
@@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) {
  }
 }
+// b[i] = a[i] > 0.0f ? a[i] : a[i] * w
+void prelu(const float* a, float w, float* b, int len) {
+  int offset = len % 16;
+  float32x4_t ma0, ma1, ma2, ma3;
+  float32x4_t zero = vdupq_n_f32(0.f);
+  float32x4_t vw = vdupq_n_f32(w);
+  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = vld1q_f32(a);
+    ma1 = vld1q_f32(a + 4);
+    ma2 = vld1q_f32(a + 8);
+    ma3 = vld1q_f32(a + 12);
+    uint32x4_t flag0 = vcgtq_f32(ma0, zero);
+    uint32x4_t flag1 = vcgtq_f32(ma1, zero);
+    uint32x4_t flag2 = vcgtq_f32(ma2, zero);
+    uint32x4_t flag3 = vcgtq_f32(ma3, zero);
+    float32x4_t mul0 = vmulq_f32(ma0, vw);
+    float32x4_t mul1 = vmulq_f32(ma1, vw);
+    float32x4_t mul2 = vmulq_f32(ma2, vw);
+    float32x4_t mul3 = vmulq_f32(ma3, vw);
+    ma0 = vbslq_f32(flag0, ma0, mul0);
+    ma1 = vbslq_f32(flag1, ma1, mul1);
+    ma2 = vbslq_f32(flag2, ma2, mul2);
+    ma3 = vbslq_f32(flag3, ma3, mul3);
+    vst1q_f32(b, ma0);
+    vst1q_f32(b + 4, ma1);
+    vst1q_f32(b + 8, ma2);
+    vst1q_f32(b + 12, ma3);
+  }
+  for (int i = 0; i < offset; i++) {
+    b[i] = a[i] > 0.0f ? a[i] : a[i] * w;
+  }
+}
 }  // namespace neon
 }  // namespace paddle

--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
@@ -18,6 +18,7 @@ namespace paddle {
 namespace neon {
 void relu(const float* a, float* b, int len);
+void prelu(const float* a, float w, float* b, int len);
 }  // namespace neon
 }  // namespace paddle
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -79,7 +79,7 @@ public:
 #ifdef PADDLE_CUDA_FP16
  HOSTDEVICE inline explicit float16(const half& h) {
 #if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(&h)->x;
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
 #else
    x = h.x;
 #endif  // CUDA_VERSION >= 9000
@@ -101,7 +101,7 @@ public:
    half tmp = __float2half(val);
    x = *reinterpret_cast<uint16_t*>(&tmp);
-#elif defined(PADDLE_NEON)
+#elif defined(PADDLE_WITH_NATIVE_FP16)
    float32x4_t tmp = vld1q_dup_f32(&val);
    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
    x = *reinterpret_cast<uint16_t*>(&res);
@@ -145,7 +145,7 @@ public:
 #ifdef PADDLE_CUDA_FP16
  HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(&rhs)->x;
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
 #else
    x = rhs.x;
 #endif
@@ -252,7 +252,7 @@ public:
    half tmp = *reinterpret_cast<const half*>(this);
    return __half2float(tmp);
-#elif defined(PADDLE_NEON)
+#elif defined(PADDLE_WITH_NATIVE_FP16)
    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
    return vgetq_lane_f32(vcvt_f32_f16(res), 0);

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -244,7 +244,7 @@ TEST(Matrix, unary) {
    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
                 << "support so we cannot test matrix inverse. To test "
                 << "matrix inverse, please install LAPACKE "
-                 << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle.";
+                 << "and MKL/Openblas, and re-build PaddlePaddle.";
 #endif
  }
 }

--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
+#include <algorithm>   // for std::max
 #include "gflags/gflags.h"
@@ -28,7 +29,7 @@ limitations under the License. */
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
 DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -43,7 +44,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
  void* p;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
  // memory alignment
  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
@@ -77,45 +78,20 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
  if (size <= 0) return nullptr;
+  void* p;
-  size_t available = 0;
+  cudaError_t result = cudaMalloc(&p, size);
-  size_t capacity = 0;
-  paddle::platform::GpuMemoryUsage(available, capacity);
-  // Reserve memory for page tables, etc.
-  size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize();
-  size_t usable = available > reserving ? available - reserving : 0;
-  // If remaining size no less than expected size, using general
-  // cudaMalloc to allocate GPU memory.
-  void* p = 0;
-  if (size <= usable) {
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      index = 0;
-      gpu_alloc_size_ += size;
-      return p;
-    }
-  }
-  // If remaining size less than expected size or cudaMalloc failed,
-  // cudaMallocHost will be considered as a fallback allocator.
-  //
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
-  // of host fallback allocation. Allocates too much would reduce
-  // the amount of memory available to the underlying system for paging.
-  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
-  if (size > usable) return nullptr;
-  cudaError_t result = cudaMallocHost(&p, size);
  if (result == cudaSuccess) {
-    index = 1;
+    index = 0;
-    fallback_alloc_size_ += size;
+    gpu_alloc_size_ += size;
    return p;
+  } else {
+    LOG(WARNING)
+        << "Cannot malloc " << size / 1024.0 / 1024.0
+        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
+           "environment variable to a lower value. Current value is "
+        << FLAGS_fraction_of_gpu_memory_to_use;
+    return nullptr;
  }
-  return nullptr;
 }
 void GPUAllocator::Free(void* p, size_t size, size_t index) {

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -138,7 +138,7 @@ function(op_library TARGET)
    if ("${TARGET}" STREQUAL "nccl_op")
        set(pybind_flag 1)
        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
    endif()
    # reduce_op contains several operators

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
-        ctx.device_context());
+        ctx.GetPlace());
  }
 };

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 // FIXME(typhoonzero): types of T is for inference data.
 // label data is always int64
-REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
+REGISTER_OP_CUDA_KERNEL(accuracy,
-                       paddle::operators::AccuracyOpCUDAKernel<double>);
+                        paddle::operators::AccuracyOpCUDAKernel<float>,
+                        paddle::operators::AccuracyOpCUDAKernel<double>);
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -21,7 +21,7 @@ namespace operators {
 using Tensor = framework::Tensor;
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AccuracyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
 REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
            ops::ActivationOpGrad);
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
-  REGISTER_OP_CPU_KERNEL(                                                     \
+  REGISTER_OP_CPU_KERNEL(                                                 \
-      act_type,                                                               \
+      act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
-      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
+                                      ops::functor<float>>,               \
-      ops::ActivationKernel<paddle::platform::CPUPlace,                       \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,           \
-                            ops::functor<double>>);                           \
+                            ops::functor<double>>);                       \
-  REGISTER_OP_CPU_KERNEL(                                                     \
+  REGISTER_OP_CPU_KERNEL(                                                 \
-      act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace,  \
+      act_type##_grad,                                                    \
-                                                 ops::grad_functor<float>>,   \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
-      ops::ActivationGradKernel<paddle::platform::CPUPlace,                   \
+                                ops::grad_functor<float>>,                \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
                                ops::grad_functor<double>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -17,16 +17,17 @@
 namespace ops = paddle::operators;
-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)       \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)   \
-  REGISTER_OP_GPU_KERNEL(                                                     \
+  REGISTER_OP_CUDA_KERNEL(                                                 \
-      act_type,                                                               \
+      act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
-      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
+                                      ops::functor<float>>,                \
-      ops::ActivationKernel<paddle::platform::GPUPlace,                       \
+      ops::ActivationKernel<paddle::platform::CUDADeviceContext,           \
-                            ops::functor<double>>);                           \
+                            ops::functor<double>>);                        \
-  REGISTER_OP_GPU_KERNEL(                                                     \
+  REGISTER_OP_CUDA_KERNEL(                                                 \
-      act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace,  \
+      act_type##_grad,                                                     \
-                                                 ops::grad_functor<float>>,   \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
-      ops::ActivationGradKernel<paddle::platform::GPUPlace,                   \
+                                ops::grad_functor<float>>,                 \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
                                ops::grad_functor<double>>);
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class ActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
@@ -32,18 +32,19 @@ class ActivationKernel
    auto x = framework::EigenVector<T>::Flatten(*X);
    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
-    functor(place, x, y);
+    functor(*place, x, y);
  }
 };
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class ActivationGradKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
@@ -59,13 +60,14 @@ class ActivationGradKernel
    auto x = framework::EigenVector<T>::Flatten(*X);
    auto y = framework::EigenVector<T>::Flatten(*Y);
    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
-    functor(place, x, y, dy, dx);
+    functor(*place, x, y, dy, dx);
  }
 };

--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -92,12 +92,12 @@ for gradient descent.
 Adadelta updates are as follows:
-$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
+$$
-paramUpdate =  - $\sqrt{((avgSquaredUpdate + \epsilon) /
+avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\
-                       (avgSquaredGrad_out + \epsilon))}$ * grad \break
+param\_update =  - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\
-avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
+avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\
-                                  {(paramUpdate)}^2 \break
+param\_out = param + param\_update
-paramOut = param + paramUpdate$$
+$$
 )DOC");
  }
@@ -109,5 +109,5 @@ paramOut = param + paramUpdate$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
+    ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/adadelta_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
+    ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdadeltaOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
        framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
    auto avg_squared_update_out =
        framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    avg_squared_grad_out.device(place) =
        rho * avg_squared_grad + (1 - rho) * grad.square();

--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -80,8 +80,8 @@ Adaptive Gradient Algorithm (Adagrad).
 The update is done as follows:
-$$momentOut = moment + grad * grad \break
+$$moment\_out = moment + grad * grad \\
-paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
+param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
 $$
 The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
@@ -100,8 +100,8 @@ size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
 }  // namespace
 template <typename T>
-struct SparseAdagradFunctor<platform::CPUPlace, T> {
+struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                  const framework::SelectedRows& grad,
                  const framework::Tensor& learning_rate, T epsilon,
                  framework::Tensor* moment, framework::Tensor* param) {
@@ -120,7 +120,7 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
            {static_cast<int64_t>(merge_rows.size()), grad_width}),
        context.GetPlace());
-    math::SetConstant<platform::CPUPlace, T> constant_functor;
+    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
    constant_functor(context, grad_merge->mutable_value(), 0.0);
    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -144,9 +144,9 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
    auto gs =
        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
+    gs.device(*context.eigen_device()) = gm * gm;
-    math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
+    math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
    functor(context, *grad_square, moment);
    // 3. update parameter
@@ -164,13 +164,13 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
  }
 };
-template struct SparseAdagradFunctor<platform::CPUPlace, float>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
-template struct SparseAdagradFunctor<platform::CPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
+    adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
+    ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
 }  // namespace
 template <typename T>
-struct SparseAdagradFunctor<platform::GPUPlace, T> {
+struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                  const framework::SelectedRows& grad,
                  const framework::Tensor& learning_rate, T epsilon,
                  framework::Tensor* moment, framework::Tensor* param) {
@@ -92,7 +92,7 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
            {static_cast<int64_t>(merge_rows.size()), grad_width}),
        context.GetPlace());
-    math::SetConstant<platform::GPUPlace, T> constant_functor;
+    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
    constant_functor(context, grad_merge->mutable_value(), 0.0);
    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -119,9 +119,9 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
    auto gs =
        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
+    gs.device(*context.eigen_device()) = gm * gm;
-    math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
+    math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
    functor(context, *grad_square, moment);
    // 3. update parameter
@@ -139,13 +139,13 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
  }
 };
-template struct SparseAdagradFunctor<platform::GPUPlace, float>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
-template struct SparseAdagradFunctor<platform::GPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
-    adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
+    adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
+    ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
@@ -19,15 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SparseAdagradFunctor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                  const framework::SelectedRows& grad,
                  const framework::Tensor& learning_rate, T epsilon,
                  framework::Tensor* moment, framework::Tensor* param);
 };
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel<T> {
      auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
      auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-      auto place = ctx.GetEigenDevice<Place>();
+      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-      moment_out.device(place) = moment + grad * grad;
+      moment_out.device(*place) = moment + grad * grad;
      Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-      param_out.device(place) =
+      param_out.device(*place) =
          param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
    } else if (grad_var->IsType<framework::SelectedRows>()) {
      auto* param_tensor = ctx.Input<framework::Tensor>("Param");
@@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel<T> {
      auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
      PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
-      SparseAdagradFunctor<Place, T> functor;
+      SparseAdagradFunctor<DeviceContext, T> functor;
-      functor(ctx.device_context(), *ctx.Input<framework::SelectedRows>("Grad"),
+      functor(ctx.template device_context<DeviceContext>(),
+              *ctx.Input<framework::SelectedRows>("Grad"),
              *ctx.Input<framework::Tensor>("LearningRate"), epsilon,
              moment_out_tensor, param_out_tensor);
    } else {

--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -112,11 +112,13 @@ adaptive estimates of lower-order moments.
 Adam updates:
-$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
+$$
-moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
+moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
-learningRate = learningRate *
+moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
-                  $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
+learning\_rate = learning\_rate *
-paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
+                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+$$
 )DOC");
  }
@@ -126,6 +128,6 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
-REGISTER_OP_CPU_KERNEL(adam,
+REGISTER_OP_CPU_KERNEL(
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
+    adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
+    ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/adam_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(adam,
+REGISTER_OP_CUDA_KERNEL(
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
+    adam, ops::AdamOpKernel<paddle::platform::CUDADeviceContext, float>,
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
+    ops::AdamOpKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
    auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
    auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-    moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
+    moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad;
-    moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
+    moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square();
    // All of these are tensors of 1 element
    auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
    // Eigen does not support automatic broadcast
    // Get dimensions of moment vector to broadcast lr_t
    Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
-    param_out.device(place) =
+    param_out.device(*place) =
        param -
        lr_t.broadcast(m_dsize) *
            (moment1_out / (moment2_out.sqrt() + epsilon));

--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -108,10 +108,10 @@ Adam algorithm based on the infinity norm.
 Adamax updates:
 $$
-  momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\
+moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\
-  infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\
+inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\
-  learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\
+learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\
-  paramOut = param - learningRate * \frac{momentOut}{infNormOut}
+param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
 $$
 The original paper does not have an epsilon attribute.
@@ -127,6 +127,6 @@ division by 0 error.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
-REGISTER_OP_CPU_KERNEL(adamax,
+REGISTER_OP_CPU_KERNEL(
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
+    adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
+    ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/adamax_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(adamax,
+REGISTER_OP_CUDA_KERNEL(
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
+    adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
+    ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdamaxOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
    auto inf_norm_out =
        framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-    moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
+    moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
-    inf_norm_out.device(place) =
+    inf_norm_out.device(*place) =
        grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
    auto lr_t = lr / (1 - beta1_pow);
    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-    param_out.device(place) =
+    param_out.device(*place) =
        param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
  }
 };

--- a/paddle/operators/auc_op.h
+++ b/paddle/operators/auc_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -135,7 +135,8 @@ The required data format for this layer is one of the following:
 };
 template <typename T>
-class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
+class BatchNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
@@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
      PADDLE_THROW("can't find Y@GRAD");
    }
    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.device_context());
+                                   ctx.GetPlace());
  }
 };
 template <typename T>
-class BatchNormGradKernel<platform::CPUPlace, T>
+class BatchNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -436,8 +437,9 @@ class BatchNormGradKernel<platform::CPUPlace, T>
 namespace ops = paddle::operators;
 REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
            batch_norm_grad, ops::BatchNormGradOp);
-REGISTER_OP_CPU_KERNEL(batch_norm,
+REGISTER_OP_CPU_KERNEL(
-                       ops::BatchNormKernel<paddle::platform::CPUPlace, float>);
+    batch_norm,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
    batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUPlace, float>);
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -47,7 +47,8 @@ void ExtractNCWHD(const framework::DDim &dims,
 }
 template <typename T>
-class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
+class BatchNormKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -121,11 +122,12 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
    saved_mean->mutable_data<T>(ctx.GetPlace());
    saved_variance->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<platform::GPUPlace, T> functor;
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    functor(ctx.device_context(), saved_mean, 0);
+    math::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(ctx.device_context(), saved_variance, 0);
+    functor(dev_ctx, saved_mean, 0);
+    functor(dev_ctx, saved_variance, 0);
-    auto handle = ctx.cuda_device_context().cudnn_handle();
+    auto handle = dev_ctx.cudnn_handle();
    // Now, depending on whether we are running test or not, we have two paths.
    if (is_test) {
@@ -171,7 +173,7 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
 };
 template <typename T>
-class BatchNormGradKernel<platform::GPUPlace, T>
+class BatchNormGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -244,11 +246,12 @@ class BatchNormGradKernel<platform::GPUPlace, T>
    const void *saved_mean_data = saved_mean->template data<T>();
    const void *saved_var_data = saved_var->template data<T>();
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-        ctx.cuda_device_context().cudnn_handle(), mode_,
+        dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
+        CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
+        CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
-        x->template data<T>(), data_desc_, d_y->template data<T>(), data_desc_,
+        data_desc_, d_y->template data<T>(), data_desc_,
        d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
        scale->template data<T>(),
        d_scale->template mutable_data<T>(ctx.GetPlace()),
@@ -266,8 +269,9 @@ class BatchNormGradKernel<platform::GPUPlace, T>
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(batch_norm,
+REGISTER_OP_CUDA_KERNEL(
-                       ops::BatchNormKernel<paddle::platform::GPUPlace, float>);
+    batch_norm,
-REGISTER_OP_GPU_KERNEL(
+    ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
    batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::GPUPlace, float>);
+    ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/operators/batch_norm_op.h
+++ b/paddle/operators/batch_norm_op.h
@@ -34,13 +34,13 @@ inline TensorFormat StringToTensorFormat(const std::string& str) {
  }
 }
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
 };
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BatchNormGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;

--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -159,9 +159,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
            ops::BilinearTensorProductOpGrad);
 REGISTER_OP_CPU_KERNEL(
    bilinear_tensor_product,
-    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
+    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
 REGISTER_OP_CPU_KERNEL(
    bilinear_tensor_product_grad,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
+                                         float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
+                                         double>);
--- a/paddle/operators/bilinear_tensor_product_op.cu
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -16,11 +16,15 @@ limitations under the License. */
 #include "paddle/operators/bilinear_tensor_product_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
    bilinear_tensor_product,
-    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
-    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
+                                     float>,
-REGISTER_OP_GPU_KERNEL(
+    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
+REGISTER_OP_CUDA_KERNEL(
    bilinear_tensor_product_grad,
-    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
-    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
+                                         float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
+                                         double>);
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BilinearTensorProductKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
    int out_dim = weight_dims[0];
    auto x_dim = weight_dims[1];
    auto y_dim = weight_dims[2];
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    // Create the intermediate variable to caculate the result of
    // Input(X) multiplied by Input(Weight_i), the formula is:
@@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
      auto output_col_vec = output_mat.chip(i, 1);
      Tensor weight_mat =
          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
-                           batch_size, y_dim, x_dim, 1, x->data<T>(),
+                                   batch_size, y_dim, x_dim, 1, x->data<T>(),
-                           weight_mat.data<T>(), 0, left_mul.data<T>());
+                                   weight_mat.data<T>(), 0, left_mul.data<T>());
      output_col_vec.device(place) =
          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
    }
@@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
  }
 };
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
    auto x_mat = EigenMatrix<T>::From(*x);
    auto y_mat = EigenMatrix<T>::From(*y);
    auto d_out_mat = EigenMatrix<T>::From(*d_out);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    // Create the intermediate variable to caculate the Output(Y@Grad).
    Tensor x_scale;
    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
@@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
                            ctx.GetPlace());
    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
-    math::SetConstant<Place, T> set_zero;
+    math::SetConstant<DeviceContext, T> set_zero;
    // Set Output(X@Grad) be zero.
    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
-      set_zero(ctx.device_context(), d_x, static_cast<T>(0));
+      set_zero(dev_ctx, d_x, static_cast<T>(0));
    }
    // Set Output(Y@Grad) be zero.
    if (d_y) {
      d_y->mutable_data<T>(ctx.GetPlace());
-      set_zero(ctx.device_context(), d_y, static_cast<T>(0));
+      set_zero(dev_ctx, d_y, static_cast<T>(0));
    }
    // Caculate the Output(X@Grad) and Output(Y@Grad).
@@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                  .broadcast(bcast_for_x) *
              y_mat;
-          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
+          math::gemm<DeviceContext, T>(
-                               batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
+              dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
-                               weight_i.data<T>(), 1, d_x->data<T>());
+              y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
        }
        if (d_y) {
          x_scale_mat.device(place) =
              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                  .broadcast(bcast_for_y) *
              x_mat;
-          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+          math::gemm<DeviceContext, T>(
-                               batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
+              dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
-                               weight_i.data<T>(), 1, d_y->data<T>());
+              x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
        }
      }
    }
@@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                .broadcast(bcast_for_weight) *
            x_mat;
-        math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
+        math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim,
-                             x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
+                                     y_dim, batch_size, 1, x_scale.data<T>(),
-                             y->data<T>(), 0, d_weight_i.data<T>());
+                                     y->data<T>(), 0, d_weight_i.data<T>());
      }
    }

--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -68,10 +68,11 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUPlace;
+using CPU = paddle::platform::CPUDeviceContext;
 REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
                        ops::CastOpProtoMaker);
 REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                       ops::CastOpKernel<CPU, double>,
                       ops::CastOpKernel<CPU, int>,
-                       ops::CastOpKernel<CPU, int64_t>);
+                       ops::CastOpKernel<CPU, int64_t>,
+                       ops::CastOpKernel<CPU, bool>);
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
@@ -16,7 +16,8 @@
 template <typename T>
 using CastOpKernel =
-    paddle::operators::CastOpKernel<paddle::platform::GPUPlace, T>;
+    paddle::operators::CastOpKernel<paddle::platform::CUDADeviceContext, T>;
-REGISTER_OP_GPU_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
+REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
-                       CastOpKernel<int>, CastOpKernel<int64_t>);
+                        CastOpKernel<int>, CastOpKernel<int64_t>,
+                        CastOpKernel<bool>);
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -27,13 +27,13 @@ struct CastOpTransformFunctor {
  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
 };
-template <typename Place, typename InT>
+template <typename DeviceContext, typename InT>
 struct CastOpFunctor {
  const framework::Tensor* in_;
  framework::Tensor* out_;
-  const platform::DeviceContext& ctx_;
+  const DeviceContext& ctx_;
  CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
-                const platform::DeviceContext& ctx)
+                const DeviceContext& ctx)
      : in_(in), out_(out), ctx_(ctx) {}
  template <typename OutT>
@@ -42,13 +42,13 @@ struct CastOpFunctor {
    auto numel = in_->numel();
    auto* in_end = in_begin + numel;
    auto* out_begin = out_->mutable_data<OutT>(ctx_.GetPlace());
-    platform::Transform<Place> trans;
+    platform::Transform<DeviceContext> trans;
    trans(ctx_, in_begin, in_end, out_begin,
          CastOpTransformFunctor<InT, OutT>());
  }
 };
-template <typename Place, typename InT>
+template <typename DeviceContext, typename InT>
 class CastOpKernel : public framework::OpKernel<InT> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -56,7 +56,8 @@ class CastOpKernel : public framework::OpKernel<InT> {
    auto* out = context.Output<framework::Tensor>("Out");
    framework::VisitDataType(
        static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
-        CastOpFunctor<Place, InT>(in, out, context.device_context()));
+        CastOpFunctor<DeviceContext, InT>(
+            in, out, context.template device_context<DeviceContext>()));
  }
 };

--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
@@ -32,6 +32,13 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
                   "Output(Recall) of ChunkEvalOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
                   "Output(F1-Score) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"),
+                   "Output(NumInferChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"),
+                   "Output(NumLabelChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NumCorrectChunks"),
+        "Output(NumCorrectChunks) of ChunkEvalOp should not be null.");
    auto inference_dim = ctx->GetInputDim("Inference");
    auto label_dim = ctx->GetInputDim("Label");
@@ -42,6 +49,9 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Precision", {1});
    ctx->SetOutputDim("Recall", {1});
    ctx->SetOutputDim("F1-Score", {1});
+    ctx->SetOutputDim("NumInferChunks", {1});
+    ctx->SetOutputDim("NumLabelChunks", {1});
+    ctx->SetOutputDim("NumCorrectChunks", {1});
  }
 protected:
@@ -70,6 +80,16 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
              "sensitivity) of chunks on the given mini-batch.");
    AddOutput("F1-Score",
              "(float). The evaluated F1-Score on the given mini-batch.");
+    AddOutput("NumInferChunks",
+              "(int64_t). The number of chunks in Inference on the given "
+              "mini-batch.");
+    AddOutput(
+        "NumLabelChunks",
+        "(int64_t). The number of chunks in Label on the given mini-batch.");
+    AddOutput(
+        "NumCorrectChunks",
+        "(int64_t). The number of chunks both in Inference and Label on the "
+        "given mini-batch.");
    AddAttr<int>("num_chunk_types",
                 "(int). The number of chunk type. See below for details.");
    AddAttr<std::string>(

--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/operators/chunk_eval_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ChunkEvalKernel : public framework::OpKernel<T> {
 public:
  struct Segment {
@@ -111,9 +111,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
    std::vector<Segment> label_segments;
    std::vector<Segment> output_segments;
    std::set<int> excluded_chunk_types;
-    int64_t num_output_segments = 0;
-    int64_t num_label_segments = 0;
-    int64_t num_correct = 0;
    if (context.Attr<std::string>("chunk_scheme") == "IOB") {
      num_tag_types = 2;
      tag_begin = 0;
@@ -151,12 +149,24 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
    auto* precision = context.Output<Tensor>("Precision");
    auto* recall = context.Output<Tensor>("Recall");
    auto* f1 = context.Output<Tensor>("F1-Score");
+    auto* num_infer_chunks = context.Output<Tensor>("NumInferChunks");
+    auto* num_label_chunks = context.Output<Tensor>("NumLabelChunks");
+    auto* num_correct_chunks = context.Output<Tensor>("NumCorrectChunks");
    const int64_t* inference_data = inference->data<int64_t>();
    const int64_t* label_data = label->data<int64_t>();
    T* precision_data = precision->mutable_data<T>(context.GetPlace());
    T* racall_data = recall->mutable_data<T>(context.GetPlace());
    T* f1_data = f1->mutable_data<T>(context.GetPlace());
+    int64_t* num_infer_chunks_data =
+        num_infer_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_label_chunks_data =
+        num_label_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_correct_chunks_data =
+        num_correct_chunks->mutable_data<int64_t>(context.GetPlace());
+    *num_infer_chunks_data = 0;
+    *num_label_chunks_data = 0;
+    *num_correct_chunks_data = 0;
    auto lod = label->lod();
    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
@@ -166,17 +176,23 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
    for (int i = 0; i < num_sequences; ++i) {
      int seq_length = lod[0][i + 1] - lod[0][i];
      EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
-                 output_segments, label_segments, num_output_segments,
+                 output_segments, label_segments, *num_infer_chunks_data,
-                 num_label_segments, num_correct, num_chunk_types,
+                 *num_label_chunks_data, *num_correct_chunks_data,
-                 num_tag_types, other_chunk_type, tag_begin, tag_inside,
+                 num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
-                 tag_end, tag_single, excluded_chunk_types);
+                 tag_inside, tag_end, tag_single, excluded_chunk_types);
    }
-    *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
+    *precision_data = !(*num_infer_chunks_data)
-                                                     num_output_segments;
+                          ? 0
-    *racall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
+                          : static_cast<T>(*num_correct_chunks_data) /
-                                                 num_label_segments;
+                                (*num_infer_chunks_data);
-    *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) /
+    *racall_data = !(*num_label_chunks_data)
-                                      ((*precision_data) + (*racall_data));
+                       ? 0
+                       : static_cast<T>(*num_correct_chunks_data) /
+                             (*num_label_chunks_data);
+    *f1_data = !(*num_correct_chunks_data)
+                   ? 0
+                   : 2 * (*precision_data) * (*racall_data) /
+                         ((*precision_data) + (*racall_data));
  }
  void EvalOneSeq(const int64_t* output, const int64_t* label, int length,

--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -71,4 +71,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
                             ops::ClipByNormOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUPlace, float>);
+    clip_by_norm,
+    ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/operators/clip_by_norm_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -15,5 +15,6 @@
 #include "paddle/operators/clip_by_norm_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
-    clip_by_norm, ops::ClipByNormKernel<paddle::platform::GPUPlace, float>);
+    clip_by_norm,
+    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/operators/clip_by_norm_op.h
+++ b/paddle/operators/clip_by_norm_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ClipByNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel<T> {
    auto x = EigenVector<T>::Flatten(*input);
    auto out = EigenVector<T>::Flatten(*output);
    auto x_norm = x.square().sum().sqrt();
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
    auto temp = (x_norm <= max_norm).template cast<T>().eval();
    auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;

--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -83,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
            ops::ClipOpGrad);
-REGISTER_OP_CPU_KERNEL(clip,
+REGISTER_OP_CPU_KERNEL(
-                       ops::ClipKernel<paddle::platform::CPUPlace, float>);
+    clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(clip_grad,
+REGISTER_OP_CPU_KERNEL(
-                       ops::ClipGradKernel<paddle::platform::CPUPlace, float>);
+    clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/operators/clip_op.cu
+++ b/paddle/operators/clip_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/operators/clip_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(clip,
+REGISTER_OP_CUDA_KERNEL(
-                       ops::ClipKernel<paddle::platform::GPUPlace, float>);
+    clip, ops::ClipKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_GPU_KERNEL(clip_grad,
+REGISTER_OP_CUDA_KERNEL(
-                       ops::ClipGradKernel<paddle::platform::GPUPlace, float>);
+    clip_grad, ops::ClipGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -55,7 +55,7 @@ class ClipGradFunctor {
  T max_;
 };
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ClipKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel<T> {
    T* out_data = out->mutable_data<T>(context.GetPlace());
    const T* x_data = x->data<T>();
    int64_t numel = x->numel();
-    Transform<Place> trans;
+    Transform<DeviceContext> trans;
-    trans(context.device_context(), x_data, x_data + numel, out_data,
+    trans(context.template device_context<DeviceContext>(), x_data,
-          ClipFunctor<T>(min, max));
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
  }
 };
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ClipGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -86,9 +86,9 @@ class ClipGradKernel : public framework::OpKernel<T> {
      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
      const T* d_out_data = d_out->data<T>();
      const T* x_data = x->data<T>();
-      Transform<Place> trans;
+      Transform<DeviceContext> trans;
-      trans(context.device_context(), d_out_data, d_out_data + numel, x_data,
+      trans(context.template device_context<DeviceContext>(), d_out_data,
-            d_x_data, ClipGradFunctor<T>(min, max));
+            d_out_data + numel, x_data, d_x_data, ClipGradFunctor<T>(min, max));
    }
  }
 };

--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -14,10 +14,10 @@
 #include "paddle/operators/compare_op.h"
-REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
-REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
-REGISTER_LOGICAL_KERNEL(greater_than, GPU,
+REGISTER_LOGICAL_KERNEL(greater_than, CUDA,
                        paddle::operators::GreaterThanFunctor);
-REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
+REGISTER_LOGICAL_KERNEL(greater_equal, CUDA,
                        paddle::operators::GreaterEqualFunctor);
-REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
+REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -59,7 +59,7 @@ struct EqualFunctor {
  }
 };
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class CompareOpKernel
    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
 public:
@@ -69,24 +69,23 @@ class CompareOpKernel
    auto* y = context.Input<framework::Tensor>("Y");
    auto* out = context.Output<framework::Tensor>("Out");
    Functor binary_func;
-    platform::Transform<Place> trans;
+    platform::Transform<DeviceContext> trans;
-    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
+          x->data<T>() + x->numel(), y->data<T>(),
-          binary_func);
+          out->mutable_data<bool>(context.GetPlace()), binary_func);
  }
 };
 }  // namespace operators
 }  // namespace paddle
-#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                     \
+#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                    \
-  REGISTER_OP_##dev##_KERNEL(                                              \
+  REGISTER_OP_##dev##_KERNEL(                                             \
-      op_type,                                                             \
+      op_type, ::paddle::operators::CompareOpKernel<                      \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                   ::paddle::platform::dev##DeviceContext, functor<int>>, \
-                                           functor<int>>,                  \
+      ::paddle::operators::CompareOpKernel<                               \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+          ::paddle::platform::dev##DeviceContext, functor<int64_t>>,      \
-                                           functor<int64_t>>,              \
+      ::paddle::operators::CompareOpKernel<                               \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+          ::paddle::platform::dev##DeviceContext, functor<float>>,        \
-                                           functor<float>>,                \
+      ::paddle::operators::CompareOpKernel<                               \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+          ::paddle::platform::dev##DeviceContext, functor<double>>);
-                                           functor<double>>);
--- a/paddle/operators/concat_op.cu.cc
+++ b/paddle/operators/concat_op.cu.cc
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
--- a/paddle/operators/conv_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
--- a/paddle/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
--- a/paddle/operators/cos_sim_op.cu
+++ b/paddle/operators/cos_sim_op.cu
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/operators/crf_decoding_op.h
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/operators/decayed_adagrad_op.h
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
--- a/paddle/operators/expand_op.cu
+++ b/paddle/operators/expand_op.cu
--- a/paddle/operators/expand_op.h
+++ b/paddle/operators/expand_op.h
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/operators/fill_constant_batch_size_like_op.h
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
--- a/paddle/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
--- a/paddle/operators/ftrl_op.cu
+++ b/paddle/operators/ftrl_op.cu
--- a/paddle/operators/ftrl_op.h
+++ b/paddle/operators/ftrl_op.h
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
--- a/paddle/operators/gru_op.cu.cc
+++ b/paddle/operators/gru_op.cu.cc
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/operators/gru_unit_op.cu
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
--- a/paddle/operators/hinge_loss_op.cu
+++ b/paddle/operators/hinge_loss_op.cu
--- a/paddle/operators/hinge_loss_op.h
+++ b/paddle/operators/hinge_loss_op.h
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/operators/huber_loss_op.cu
--- a/paddle/operators/huber_loss_op.h
+++ b/paddle/operators/huber_loss_op.h
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/operators/l1_norm_op.cu
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/operators/linear_chain_crf_op.cu
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
--- a/paddle/operators/lod_reset_op.cu
+++ b/paddle/operators/lod_reset_op.cu
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/operators/log_loss_op.cu
--- a/paddle/operators/log_loss_op.h
+++ b/paddle/operators/log_loss_op.h
--- a/paddle/operators/logical_op.cu
+++ b/paddle/operators/logical_op.cu
--- a/paddle/operators/logical_op.h
+++ b/paddle/operators/logical_op.h
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
--- a/paddle/operators/lstm_op.cu.cc
+++ b/paddle/operators/lstm_op.cu.cc
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/operators/margin_rank_loss_op.cu
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/operators/margin_rank_loss_op.h
--- a/paddle/operators/math/context_project.cc
+++ b/paddle/operators/math/context_project.cc
--- a/paddle/operators/math/context_project.cu
+++ b/paddle/operators/math/context_project.cu
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/operators/math/lstm_compute.cu
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
--- a/paddle/operators/math/matmul.h
+++ b/paddle/operators/math/matmul.h
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/operators/math/selected_rows_functor_test.cc
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
--- a/paddle/operators/math/sequence_pooling.cc
+++ b/paddle/operators/math/sequence_pooling.cc
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
--- a/paddle/operators/math/sequence_pooling.h
+++ b/paddle/operators/math/sequence_pooling.h
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
--- a/paddle/operators/math/softmax_impl.h
+++ b/paddle/operators/math/softmax_impl.h
--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
--- a/paddle/operators/matmul_op.cu.cc
+++ b/paddle/operators/matmul_op.cu.cc
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.cu.cc
+++ b/paddle/operators/mul_op.cu.cc
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
--- a/paddle/operators/nccl_op.cu.cc
+++ b/paddle/operators/nccl_op.cu.cc
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
--- a/paddle/operators/pad_op.cu
+++ b/paddle/operators/pad_op.cu
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.cu.cc
+++ b/paddle/operators/pool_op.cu.cc
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/operators/pool_with_index_op.cu.cc
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
--- a/paddle/operators/positive_negative_pair_op.h
+++ b/paddle/operators/positive_negative_pair_op.h
--- a/paddle/operators/precision_recall_op.h
+++ b/paddle/operators/precision_recall_op.h
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/operators/prelu_op.cu
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
--- a/paddle/operators/proximal_adagrad_op.cu
+++ b/paddle/operators/proximal_adagrad_op.cu
--- a/paddle/operators/proximal_adagrad_op.h
+++ b/paddle/operators/proximal_adagrad_op.h
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
--- a/paddle/operators/proximal_gd_op.cu
+++ b/paddle/operators/proximal_gd_op.cu
--- a/paddle/operators/proximal_gd_op.h
+++ b/paddle/operators/proximal_gd_op.h
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/reshape_op.cu
+++ b/paddle/operators/reshape_op.cu
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
--- a/paddle/operators/rmsprop_op.h
+++ b/paddle/operators/rmsprop_op.h
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
--- a/paddle/operators/row_conv_op.cc
+++ b/paddle/operators/row_conv_op.cc
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
--- a/paddle/operators/row_conv_op.h
+++ b/paddle/operators/row_conv_op.h
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
--- a/paddle/operators/seq_expand_op.cu
+++ b/paddle/operators/seq_expand_op.cu
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
--- a/paddle/operators/sequence_concat_op.cu.cc
+++ b/paddle/operators/sequence_concat_op.cu.cc
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/operators/sequence_conv_op.h
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/sequence_pool_op.cu
+++ b/paddle/operators/sequence_pool_op.cu
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
--- a/paddle/operators/sequence_slice_op.cu
+++ b/paddle/operators/sequence_slice_op.cu
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
--- a/paddle/operators/sequence_softmax_op.cu.cc
+++ b/paddle/operators/sequence_softmax_op.cu.cc
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
--- a/paddle/operators/sign_op.cu
+++ b/paddle/operators/sign_op.cu
--- a/paddle/operators/sign_op.h
+++ b/paddle/operators/sign_op.h
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
--- a/paddle/operators/smooth_l1_loss_op.cu
+++ b/paddle/operators/smooth_l1_loss_op.cu
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_op.cu.cc
+++ b/paddle/operators/softmax_op.cu.cc
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
--- a/paddle/operators/split_op.cu.cc
+++ b/paddle/operators/split_op.cu.cc
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
--- a/paddle/operators/spp_op.cc
+++ b/paddle/operators/spp_op.cc
--- a/paddle/operators/spp_op.cu.cc
+++ b/paddle/operators/spp_op.cu.cc
--- a/paddle/operators/spp_op.h
+++ b/paddle/operators/spp_op.h
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
--- a/paddle/operators/squared_l2_distance_op.cu
+++ b/paddle/operators/squared_l2_distance_op.cu
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
--- a/paddle/operators/squared_l2_norm_op.cu
+++ b/paddle/operators/squared_l2_norm_op.cu
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
--- a/paddle/operators/transpose_op.cu.cc
+++ b/paddle/operators/transpose_op.cu.cc
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
--- a/python/.gitignore
+++ b/python/.gitignore
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
--- a/python/paddle/v2/fluid/layers/__init__.py
+++ b/python/paddle/v2/fluid/layers/__init__.py
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
--- a/python/paddle/v2/fluid/tests/test_fill_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_op.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_reduce_op.py
+++ b/python/paddle/v2/fluid/tests/test_reduce_op.py
--- a/python/paddle/v2/fluid/tests/test_registry.py
+++ b/python/paddle/v2/fluid/tests/test_registry.py
--- a/python/paddle/v2/fluid/tests/test_reshape_op.py
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
--- a/python/paddle/v2/fluid/tests/test_row_conv_op.py
+++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py
--- a/python/paddle/v2/fluid/tests/test_spp_op.py
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
--- a/python/setup.py.in
+++ b/python/setup.py.in