diff --git a/CMakeLists.txt b/CMakeLists.txt index 5df83499d5dde29b205ee17fba81a63c9a643235..00996cb7ed5cc573c42b69be6db369c3654d6d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,10 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) project(paddle CXX C Go) -message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION}) -message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION}) +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 96fc886a342cae38d5b804266d3af7bc909a4da2..c4712f19eb80b34ffbf713d2b13fc0c775312af1 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -19,7 +19,7 @@ ExternalProject_Add( if (${CMAKE_VERSION} VERSION_LESS "3.3.0") set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) - file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";") + file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";") add_library(eigen3 STATIC ${dummyfile}) else() add_library(eigen3 INTERFACE) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 5d24caebdcc5a28823164d718fb1628be5c4179d..89fc34796a03ff3f3e5b022ae10b2646832b1ac7 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -63,9 +63,30 @@ ExternalProject_Add( -DMKLROOT:PATH=${MKLML_ROOT} ) -ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) -ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) +ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") add_definitions(-DPADDLE_WITH_MKLDNN) -LIST(APPEND external_project_dependencies mkldnn) +LIST(APPEND external_project_dependencies shared_mkldnn) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(mkldnn STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) + +# copy the real so.0 lib to install dir +# it can be directly contained in wheel or capi +SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) +ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) + +IF(WITH_C_API) + INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) +ENDIF() + diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 20dbc32a738d982df2d3f035206279c82c8de264..15a07ea3daf5aa606235f20288a8306966334a1a 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) + +IF(WITH_C_API) + INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} 
DESTINATION lib) +ENDIF() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 97857a686b38d935b19f510ecdcb66bcca91fe03..0e79c0cc7992060cbe3b668ec927936183389eb6 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND}) CACHE FILEPATH "openblas library." FORCE) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") + SET(OPENBLAS_COMMIT "v0.2.20") IF(CMAKE_CROSSCOMPILING) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) IF(ANDROID) - # arm_soft_fp_abi branch of OpenBLAS to support softfp - # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + # use softfp SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) ENDIF() ELSEIF(IOS) IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) @@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() ELSEIF(RPI) # use hardfp - SET(OPENBLAS_COMMIT "v0.2.20") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) ENDIF() ELSE() IF(APPLE) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") ENDIF() - SET(OPENBLAS_COMMIT "v0.2.20") SET(OPTIONAL_ARGS "") IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) @@ -113,7 +109,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # FIXME(gangliao): generate cblas target to track all high performance # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) -FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") ADD_LIBRARY(cblas STATIC ${dummyfile}) TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 66c8e3ad7ef7c80c1f388c25983425a0db5c0220..585db019d521b1699baadfae31ef95b5059c71b4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -120,7 +120,7 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${libs}) # Generate dummy staic lib - file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") add_library(${TARGET_NAME} STATIC ${target_SRCS}) target_link_libraries(${TARGET_NAME} ${libs_deps}) @@ -160,7 +160,7 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${libs} ${target_OBJS}) # Generate dummy staic lib - file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") add_library(${TARGET_NAME} STATIC ${target_SRCS}) target_link_libraries(${TARGET_NAME} ${libs_deps}) @@ -324,7 +324,7 @@ function(go_library TARGET_NAME) ) # Add dummy code to support `make target_name` under Terminal Command - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} 
= \"${dummyfile}\";") if (go_library_SHARED OR go_library_shared) add_library(${TARGET_NAME} SHARED ${dummyfile}) else() diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 004ee2d8c85ce7661886179570e693d7d61bc6d8..a7c8670f66cc7f319e41155211ead2d89126117f 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -307,6 +307,12 @@ sequence_expand :noindex: +gru_unit +-------- +.. autofunction:: paddle.v2.fluid.layers.gru_unit + :noindex: + + lstm_unit --------- .. autofunction:: paddle.v2.fluid.layers.lstm_unit diff --git a/doc/design/ci_build_whl.png b/doc/design/ci_build_whl.png new file mode 100644 index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40 Binary files /dev/null and b/doc/design/ci_build_whl.png differ diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md new file mode 100644 index 0000000000000000000000000000000000000000..afc65e831d58ff427663806e56294292ccbef85b --- /dev/null +++ b/doc/design/concurrent_programming.md @@ -0,0 +1,163 @@ +# Design Doc: Concurrent Programming with Fluid + +With PaddlePaddle Fluid, users describe a program other than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model. + +Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting questions is **how does a `ProgramDesc` represents a concurrent program?** + +The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org). + +## An Analogy + +The following table compares concepts in Fluid and Go + +| Go | Fluid | +|----|-------| +|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) | +| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) | +| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) | +| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) | + +## An Example Concurrent Program + +To review all above concepts in an example, let us take a simple program and writes its distributed version. + +Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors. + +```go +import "fluid" + +func paddlepaddle() { + X = fluid.read(...) + W = fluid.Tensor(...) + Y = fluid.mult(X, W) +} +``` + +Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message. 
+
+## An Example Concurrent Program
+
+To review all the above concepts in an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+  paddlepaddle()
+  fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, L, Y],
+    ops = [
+      read(output = X)
+      kube_get_workers_addrs(output = L)
+      Y = tensor_array(len(L))
+      parallel_for(input = X, output = Y,
+                   attrs = {L, block_id(1)}) # referring to block 1
+    ]
+  }
+
+  block[1] = Block {
+    parent = 0,
+    vars = [x, y, index],
+    ops = [
+      slice(input = [X, index], output = x) # index is initialized by parallel_for
+      send(input = x, attrs = L[index])
+      recv(outputs = y, attrs = L[index])
+      assign(input = y, output = Y[index])
+    ]
+  }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() {  //// block 0
+  X = fluid.read(...)
+  L = fluid.k8s.get_worker_addrs()
+  Y = fluid.tensor_array(len(L))
+  fluid.parallel_for(X, L,
+                     func(index int) {  //// block 1
+                       x = X[index]
+                       fluid.send(L[index], x)
+                       y = fluid.recv(L[index])
+                       Y[index] = y
+                     })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to the Kubernetes API.
+- `fluid.k8s.get_worker_addrs` returns the list of IPs and ports of all pods of the current job except for the current one (the master pod).
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
+     1. creates an Executor instance, and
+     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+  W = Tensor(...)
+  x = fluid.listen_and_do(
+        fluid.k8s.self_addr(),
+        func(input Tensor) {
+          output = fluid.mult(input, W)
+        })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+  1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+  2. once a connection is established,
+     1. creates a scope of two parameters, "input" and "output",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summary
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+   2. calling the `fluid.run` function that runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
+5. Intrinsics/operators' `Run` methods might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/design/images/control_flow_graph.png b/doc/design/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/design/images/control_flow_graph.png differ
diff --git a/doc/design/images/dataflow_equations.png b/doc/design/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/design/images/dataflow_equations.png differ
diff --git a/doc/design/images/deep_learning.png b/doc/design/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/design/images/deep_learning.png differ
diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..00f514711a46bfd5af3bae51e0d9225ecc4c8998
--- /dev/null
+++ b/doc/design/memory_optimization.md
@@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In a lecture from Andrew Ng, he attributes the recent success of AI to a combination of:
+
+- availability of Big Data
+- supercomputing power to process this Big Data over very large neural networks
+- modern algorithms
+
+The following graph shows the details:
+
+![](images/deep_learning.png)
+
+A larger model usually brings better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies for memory optimization, including in-place operation and memory sharing.
+
+#### In-place Operation
+In a relu activation operator:
+
+$y = \max(x, 0)$
+
+If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory blocks of variable y and variable x are the same. An in-place operation immediately saves 50% of the memory occupancy.
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+The following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then, variable e can share the memory of variable a from the pool.
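+
+A toy sketch of the pooling idea, in plain Python (the class and method names here are hypothetical illustrations, not Fluid's actual allocator API):
+
+```python
+from collections import defaultdict
+
+class MemoryPool(object):
+    """Toy pool: blocks are keyed by size; a dead variable's block is reused."""
+    def __init__(self):
+        self._free = defaultdict(list)  # size -> list of released block ids
+
+    def release(self, block_id, size):
+        # called when liveness analysis says a variable is dead
+        self._free[size].append(block_id)
+
+    def acquire(self, size):
+        # reuse a released block of the same size if one exists
+        if self._free[size]:
+            return self._free[size].pop()
+        return "new_block"  # otherwise allocate fresh memory
+
+pool = MemoryPool()
+pool.release("mem_of_a", size=1024)  # variable a dies after op2
+print(pool.acquire(1024))            # variable e reuses a's block -> "mem_of_a"
+```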
+
+### Live Variable Analysis
+
+It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know whether a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
+
+In compilers, the front end translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
+
+We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+The following is the flow graph for a simple loop.
+
+![](images/control_flow_graph.png)
+
+#### Dataflow Analysis
+
+The liveness of a variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can speak of the *def* of a variable as the set of graph nodes that define it, or the *def* of a graph node as the set of variables that it defines, and similarly for the *use* of a variable or graph node. In the former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
+
+![](images/dataflow_equations.png)
+
+### Memory optimization transpiler
+
+Finally, we take the basic strategies and the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### construct control flow graph
+
+The following is the ProgramDesc protobuf of the [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
+
+```python
+class ControlFlowGraph(object):
+    def __init__(self, Program):
+        self._successors = defaultdict(set)
+        self._predecessors = defaultdict(set)
+        self._uses = defaultdict(set)
+        self._defs = defaultdict(set)
+        self._live_in = defaultdict(set)
+        self._live_out = defaultdict(set)
+        self._program = Program
+
+    def build(self):
+        pass
+
+    def dataflow_analysis(self):
+        pass
+
+    def memory_optimization(self):
+        pass
+
+    def get_program(self):
+        return self._program
+```
+
+#### make dataflow analysis
+
+We follow the guidance from compilers and try to solve the dataflow equations to get the liveness of every variable. If the live-in of an operator node is different from the live-out, then we can apply memory sharing.
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, we can release variable b and variable c; after op2, we can release variable a; after op3, we can release variable d and variable f.
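+
+As an illustration, here is a minimal fixed-point solver for these equations, written in plain Python over the `_uses`/`_defs`/`_successors` sets sketched in the `ControlFlowGraph` skeleton above (the actual transpiler code may differ):
+
+```python
+def solve_liveness(nodes, succ, use, defs):
+    """Iterate live_in/live_out until a fixed point is reached."""
+    live_in = {n: set() for n in nodes}
+    live_out = {n: set() for n in nodes}
+    changed = True
+    while changed:
+        changed = False
+        for n in reversed(nodes):  # reverse order speeds up convergence
+            out = set().union(*[live_in[s] for s in succ[n]]) if succ[n] else set()
+            inn = use[n] | (out - defs[n])
+            if out != live_out[n] or inn != live_in[n]:
+                live_in[n], live_out[n] = inn, out
+                changed = True
+    return live_in, live_out
+
+# the three-op example above: a = op1(b, c); d = op2(a); e = op3(d, f)
+nodes = ["op1", "op2", "op3"]
+succ = {"op1": {"op2"}, "op2": {"op3"}, "op3": set()}
+use = {"op1": {"b", "c"}, "op2": {"a"}, "op3": {"d", "f"}}
+defs = {"op1": {"a"}, "op2": {"d"}, "op3": {"e"}}
+live_in, live_out = solve_liveness(nodes, succ, use, defs)
+print(live_in["op2"], live_out["op2"])  # {'a', 'f'} {'d', 'f'} (set order may vary)
+```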
+
+#### memory sharing policy
+
+A memory pool will be maintained during the stage of memory optimization. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be applied to its input/output variables.
+
+```
+if op.support_inplace():
+    i --> pool
+    pool --> o
+else:
+    pool --> o
+    i --> pool
+```
+
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 14c081ea84282e52a2e36475c3c0ea755122d154..b9787261092f1f27377886152cb1596d9ff54188 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -7,11 +7,9 @@ PaddlePaddle每次发新的版本,遵循以下流程:
 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
 1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
 1. 对这个版本的提交,做如下几个操作:
+   * 使用Regression Test List作为检查列表,测试本次release的正确性。
+   * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步
    * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
-   * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
-   * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。
-   * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性
-   * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步
    * 编译这个版本的python wheel包,并发布到pypi。
    * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。
    * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。
@@ -21,8 +19,8 @@ PaddlePaddle每次发新的版本,遵循以下流程:
      pip install twine
      twine upload dist/[package to upload]
      ```
+   * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
 1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
 1. 协同完成Release Note的书写
@@ -31,6 +29,30 @@ PaddlePaddle每次发新的版本,遵循以下流程:
 * `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+
+## 发布wheel包到pypi
+
+使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以
+弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后
+可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法
+使用`twine`工具上传即可。
+
+
+
+* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
+  发行版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
+* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。
+
+## 发布Docker镜像
+
+上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
+版本号对应的tag即可:
+
+1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成之后。
+1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。
+1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
+1. 
执行 `docker push paddlepaddle/paddle:[version]` + ## PaddlePaddle 分支规范 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md index f54b2b3694cc2a8f1d892792fd4d39a0484dc750..4c5f10e2ecb9ec09b78926ca27552741d02d7cc9 100644 --- a/doc/design/support_new_device.md +++ b/doc/design/support_new_device.md @@ -48,8 +48,8 @@ Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/dev ``` - /-> CPUDeviceContext --> MKLDeviceContext -DeviceContext ----> CUDADeviceContext --> CUDNNDeviceContext + /-> CPUDeviceContext +DeviceContext ----> CUDADeviceContext \-> FPGADeviceContext ``` @@ -79,16 +79,6 @@ private: }; ``` -- CUDNNDeviceContext - -``` -class CUDNNDeviceContext : public CUDADeviceContext { - private: - cudnnHandle_t cudnn_handle_; -}; -``` - - ### Memory and Tensor diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index 424d7718c64438496cf0895397babd5408e1ca02..ae24ced770492743065e37654b494caf6b4c5bc0 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -1,8 +1,9 @@ # Android平台编译指南 用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: -- 基于Docker容器的编译方式 -- 基于Linux交叉编译环境的编译方式 + +- [基于Docker容器的编译方式](#基于docker容器的编译方式) +- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式) ## 基于Docker容器的编译方式 Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。 @@ -16,6 +17,12 @@ $ cd Paddle $ docker build -t username/paddle-android:dev . -f Dockerfile.android ``` +用户也可以使用PaddlePaddle提供的官方开发镜像: + +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + ### 编译PaddlePaddle C-API库 构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 Android的Docker开发镜像向用户提供两个可配置的参数: @@ -41,23 +48,25 @@ Android的Docker开发镜像向用户提供两个可配置的参数: ANDROID_API - >= 21 + >= 16 21 - 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 + ```bash $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ``` - 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 + ```bash $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ``` -执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 +执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 ## 基于Linux交叉编译环境的编译方式 本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 @@ -83,6 +92,7 @@ 
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain 此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 - 构建`arm64-v8a`、 `Android API 21`的独立工具链: + ```bash your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain @@ -90,14 +100,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain 此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 -注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。 - ### 配置交叉编译参数 CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。 - `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 - `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 @@ -119,7 +127,7 @@ Android平台可选配置参数: 其他配置参数: - `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。 -- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。 常用的cmake配置如下: @@ -147,9 +155,10 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. ``` -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 **性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + - 设置`CMAKE_BUILD_TYPE`为`Release` - 使用`clang`编译工具链 - `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md index 26858581fc1d77a9391520ac0dfd80fbd98f508c..0cf50181df4116beda3aa6faf836eda92edf6066 100644 --- a/doc/mobile/cross_compiling_for_android_en.md +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -1,6 +1,9 @@ # Build PaddlePaddle for Android -There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker. 
+There are two approaches to build PaddlePaddle for Android: + +- [Cross-Compiling Using Docker](#cross-compiling-using-docker) +- [Cross-Compiling on Linux](#cross-compiling-on-linux) ## Cross-Compiling Using Docker @@ -16,6 +19,12 @@ $ cd Paddle $ docker build -t paddle:dev-android . -f Dockerfile.android ``` +Users can directly use the published Docker image. + +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + ### Build the Inference Library We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: @@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: ANDROID_API - >= 21 + >= 16 21 @@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. -**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.** - ### Cross-Compiling Arguments CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling). Some other CMake arguments you need to know: -- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`. +- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`. - `WITH_C_API` must be `ON`, to build the C-based inference library for Android. - `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API. @@ -123,7 +130,7 @@ Some Android-specific arguments: Other useful arguments: - `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`. -- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`. +- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/C++`, or `cc/c++`. Some frequent configurations for your reference: @@ -158,6 +165,7 @@ There are some other arguments you might want to configure. - `CMAKE_BUILD_TYPE-Release` optimizes the runtime performance. Our own tip for performance optimization to use clang and Eigen or OpenBLAS: + - `CMAKE_BUILD_TYPE=Release` - `ANDROID_TOOLCHAIN=clang` - `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`. 
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md index aa390cd61f3fbd75e5a3b342f3559e76da35a918..19bfe86c511c7e43b462f94c8cabba420b3007f1 100644 --- a/doc/mobile/cross_compiling_for_ios_en.md +++ b/doc/mobile/cross_compiling_for_ios_en.md @@ -1,4 +1,4 @@ -# PaddlePaddle Compiling Guide for iOS +# Build PaddlePaddle for iOS This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS. @@ -98,7 +98,7 @@ You can set other compiling parameters for your own need. I.E. if you are trying - set `CMAKE_BUILD_TYPE` with `Release` - set `IOS_USE_VECLIB_FOR_BLAS` with `ON` -## Compile and install +## Build and install After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library. @@ -109,7 +109,7 @@ $ make install Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration. -`your/path/to/install` directory will have following directories after `compile` and `install`: +`your/path/to/install` directory will have following directories after `make install`: - `include`, contains all the C-API header files. - `lib`, contains PaddlePaddle C-API static library. diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 7d2becbdd772747d77890321fce6721d8d17fb30..4a98ede278fad85ff2beef3c8e7dd158912f693a 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -24,6 +24,7 @@ else() add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(pybind) + add_subdirectory(inference) endif() if(WITH_SWIG_PY) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index b4458eb9551724021636b628c5bf8c96f6e659aa..46439c2d2775d9de1e2bcc0cddd27b75d8bc9eb6 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -26,10 +26,15 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) cc_test(variable_test SRCS variable_test.cc) -cc_library(scope SRCS scope.cc DEPS glog) +cc_library(threadpool SRCS threadpool.cc) +cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) + +cc_library(scope SRCS scope.cc DEPS glog threadpool) cc_test(scope_test SRCS scope_test.cc DEPS scope) -cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto) +cc_library(device_data_transform SRCS device_data_transform.cc DEPS tensor) + +cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto selected_rows device_data_transform) cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context) cc_library(attribute SRCS attribute.cc DEPS framework_proto) @@ -38,7 +43,7 @@ device_context) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) -cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) +cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry 
init) @@ -70,9 +75,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_library(threadpool SRCS threadpool.cc) -cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) +cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator) cc_test(init_test SRCS init_test.cc DEPS init) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) + +nv_test(device_data_transform_test SRCS device_data_transform_test.cu + DEPS operator op_registry init math_function) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index eaf13ddcefcd8dc5a6b0438f765d8d325925aa30..85e693434af863bfc3bde29989dbbfc69678d3b7 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -427,7 +427,8 @@ std::vector> MakeBlockBackward( VLOG(5) << "Making backward " << (*it)->Type() << " op"; std::vector> op_grads; - if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || + (*it)->Type() == "parallel_do") { int step_block_idx = (*it)->GetBlockAttr("sub_block"); BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, grad_to_var, step_block_idx); diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index 9d6a8424426a68ae66cf93b803c35e33e30226f2..fed958db1584c4fda5394d59a2ef8936045a9ce9 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -11,9 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/framework/data_transform.h" +#include "paddle/framework/device_data_transform.h" #include "paddle/framework/lod_tensor.h" +#include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" namespace paddle { @@ -24,6 +27,37 @@ DataTransformFnMap& DataTransformFnMap::Instance() { return data_transform_map; } +Tensor* DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor) { + Tensor* out = nullptr; + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + out = DeviceTransform(input_tensor, expected_kernel_type.place_); + } + PADDLE_ENFORCE_NOT_NULL(out, "out should not be null"); + return out; +} + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var) { + if (in_var.IsType()) { + auto& in_lod_tensor = in_var.Get(); + auto* tran_lod_tensor = out_var.GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto& in_selected_rows = in_var.Get(); + auto* trans_selected_rows = out_var.GetMutable(); + trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); + } +} + auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(), DataLayout::kNHWC, LibraryType::kPlain); @@ -36,6 +70,28 @@ auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), DataLayout::kNCHW, LibraryType::kPlain); +// TODO(dzhwinter): Only for testing multiple op kernel. +// Dummy transform function for library_type +// should be removed. 
+auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0), + DataLayout::kAnyLayout, LibraryType::kPlain); + +auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0), + DataLayout::kAnyLayout, LibraryType::kCUDNN); + +void DummyTrans(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + auto src = in.Get(); + auto* dst = out->GetMutable(); + *dst = src; +} + void TransDataType(const platform::DeviceContext* ctx, const KernelTypePair& kernel_pair, const Variable& in, Variable* out) { @@ -74,35 +130,36 @@ void TransDataType(const platform::DeviceContext* ctx, } } -void TransDataLayout(const platform::DeviceContext* ctx, +void TransDataLayout(const std::vector& axis, + const platform::DeviceContext* ctx, const KernelTypePair& kernel_pair, const Variable& in, Variable* out) { - PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE(in.IsType(), "Only support Tensor transform!."); PADDLE_ENFORCE( platform::places_are_same_class(kernel_pair.first.place_, kernel_pair.second.place_), - "TransDataType Only Support DataType transform on same place!"); + "TransDataLayout only support DataLayout transform on same place!"); + PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_, + "TransDataLayout only support Datatype are same!"); auto src = in.Get(); auto* dst = out->GetMutable(); PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); auto src_dim = src.dims(); - dst->Resize(src_dim); - auto place = kernel_pair.second.place_; - CopyFrom(src, place, *ctx, dst); - const std::vector axis = {0, 2, 3, 1}; - std::vector dst_dim; + dst_dim.resize(axis.size()); for (size_t i = 0; i < axis.size(); i++) { dst_dim[i] = src_dim[axis[i]]; } dst->Resize(make_ddim(dst_dim)); + auto place = kernel_pair.second.place_; + dst->mutable_data(place, src.type()); auto src_type = kernel_pair.first.data_type_; - framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis)); + framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst)); dst->set_layout(kernel_pair.second.data_layout_); } @@ -111,5 +168,24 @@ void TransDataLayout(const platform::DeviceContext* ctx, } // namespace paddle namespace f = paddle::framework; + +namespace { +std::vector NHWC2NCHW = {0, 3, 1, 2}; +std::vector NCHW2NHWC = {0, 2, 3, 1}; +} + REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType); -REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout); +REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans); +REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans); +REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, + std::bind(f::TransDataLayout, NHWC2NCHW, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3, + std::placeholders::_4)); +REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC, + std::bind(f::TransDataLayout, NCHW2NHWC, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3, + std::placeholders::_4)); diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index 9abb3c99bf30fcf9deab59dc7ee9c02e7c7c775b..42fc5f4d7e84a0f62092c423524aae518f348a97 100644 --- 
a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/selected_rows.h" #include "paddle/framework/tensor.h" #include "paddle/framework/variable.h" #include "paddle/operators/math/math_function.h" @@ -49,6 +50,13 @@ struct KernelTypePairHash { } }; +Tensor* DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor); + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var); + template struct CastDataTypeFunctor { HOSTDEVICE inline OutType operator()(InType in) const { @@ -73,6 +81,7 @@ struct CastDataType { auto numel = in_.numel(); auto* in_end = in_begin + numel; auto* out_begin = out_->mutable_data(place); + if (platform::is_cpu_place(place)) { platform::Transform trans; auto* context = static_cast(ctx_); @@ -86,9 +95,9 @@ struct CastDataType { }; struct CastDataLayout { - CastDataLayout(const framework::Tensor& in, framework::Tensor* out, - const platform::DeviceContext* ctx, - const std::vector& axis) + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) : in_(in), out_(out), ctx_(ctx), axis_(axis) {} const framework::Tensor in_; framework::Tensor* out_; @@ -98,6 +107,7 @@ struct CastDataLayout { template void operator()() { auto place = ctx_->GetPlace(); + if (platform::is_cpu_place(place)) { operators::math::Transpose trans4; auto* context = static_cast(ctx_); diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 8665b6248faa2d218230449c45a10f022f3fbf4f..edd305fd17ae202926b83fbec10089719baa2e16 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -106,7 +106,7 @@ TEST(DataTransform, Register) { ASSERT_EQ(test_value, 2); } -TEST(DataTransform, Layout) { +TEST(DataTransform, DataLayout) { using namespace paddle::framework; using namespace paddle::platform; @@ -127,7 +127,19 @@ TEST(DataTransform, Layout) { } Tensor dst = out.Get(); - EXPECT_TRUE(dst.layout() != src->layout()); + + EXPECT_TRUE(dst.layout() == DataLayout::kNCHW); + EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1})); + + { + auto kernel1 = GenFromBit({1, 0, 1, 0}); + auto kernel2 = GenFromBit({1, 0, 0, 0}); + auto pair0 = std::make_pair(kernel1, kernel2); + instance.Get(pair0)(ctx, pair0, out, &in); + } + + EXPECT_TRUE(src->layout() == DataLayout::kNHWC); + EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2})); } TEST(DataTransform, DataType) { diff --git a/paddle/framework/device_data_transform.cc b/paddle/framework/device_data_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f9b7e96a284c148ca6a5e141d513342c92df3d4 --- /dev/null +++ b/paddle/framework/device_data_transform.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/device_data_transform.h" + +namespace paddle { +namespace framework { + +static const platform::DeviceContext* GetDeviceContext( + const platform::Place& src_place, const platform::Place& dst_place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { + return pool.Get(src_place); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + return pool.Get(dst_place); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + +Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + Tensor* out = new Tensor(); + auto* dev_ctx = GetDeviceContext(in.place(), dst_place); + dev_ctx->Wait(); + CopyFrom(in, dst_place, *dev_ctx, out); + dev_ctx->Wait(); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/device_data_transform.h b/paddle/framework/device_data_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..bebf0d1b320183f46ab226dc6493ba09a365fc35 --- /dev/null +++ b/paddle/framework/device_data_transform.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/device_data_transform_test.cu b/paddle/framework/device_data_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..e9100053d520184e716bcaa04ac348f03018b744 --- /dev/null +++ b/paddle/framework/device_data_transform_test.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" + +#include "paddle/framework/init.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_info.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op_function.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::DataType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + using namespace paddle::framework; + using namespace paddle::platform; + + ASSERT_EQ(InitDevices({"CPU", "GPU:0"}), true); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an 
op to run on GPU + paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + + // auto* output2_ptr = output2->Get().data(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + CopyFrom(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index bf1f0471ccbfccf13cb6f74c8088da7acd68ec0b..844d98916ea5b1ffd88615825d79af37ba7d128e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -111,7 +111,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << op->DebugString(); + VLOG(3) << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); if (FLAGS_check_nan_inf) { for (auto& vname : op->OutputVars(true)) { diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 682cff168d4d31e0565fc987604f97a671566fbd..7ec8d18b0e886948f4fb951e17875584413771db 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/framework/init.h" +#include "paddle/framework/operator.h" #include "paddle/platform/device_context.h" #include "paddle/platform/place.h" #include "paddle/string/piece.h" @@ -24,7 +25,6 @@ namespace framework { std::once_flag gflags_init_flag; -// TODO(qijun) move init gflags to init.cc void InitGflags(std::vector &argv) { std::call_once(gflags_init_flag, [&]() { int argc = argv.size(); @@ -72,8 +72,14 @@ bool InitDevices(const std::vector &devices) { LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } platform::DeviceContextPool::Init(places); + framework::UseALL(); return true; } +void InitGLOG(const std::string &prog_name) { + google::InitGoogleLogging(prog_name.c_str()); + google::InstallFailureSignalHandler(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/init.h b/paddle/framework/init.h index 33907f9eb00fb3469b53dcf8151557cc7a2d3791..9c84a03ded52632047841f95badbcf44bc9f48d1 100644 --- a/paddle/framework/init.h +++ b/paddle/framework/init.h @@ -22,6 +22,8 @@ namespace framework { void InitGflags(std::vector &argv); +void InitGLOG(const std::string &prog_name); + bool InitDevices(const std::vector &devices); } // namespace framework diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h index 7707799cae8c4edc304cd81725270a85f01fd28d..1e3084835439b0d55de72a669b93acbaef7ed6b9 100644 --- a/paddle/framework/library_type.h +++ b/paddle/framework/library_type.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
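For reference, one plausible start-up sequence using the init helpers touched here (`InitGflags`, the new `InitGLOG`, and `InitDevices`). The flag strings and ordering below are illustrative assumptions, not something the patch mandates:

```cpp
// Sketch of program start-up with the init.h helpers.
#include <string>
#include <vector>
#include "paddle/framework/init.h"

int main(int argc, char** argv) {
  std::vector<std::string> args(argv, argv + argc);
  paddle::framework::InitGflags(args);   // parses and may rewrite flags
  paddle::framework::InitGLOG(argv[0]);  // glog + failure signal handler
  // Creates one DeviceContext per requested device; per the change
  // above, this now also calls UseALL() to enable every kernel kind.
  paddle::framework::InitDevices({"CPU", "GPU:0"});
  return 0;
}
```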
*/ #pragma once +#include namespace paddle { namespace framework { @@ -41,6 +42,9 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) { inline LibraryType StringToLibraryType(const char* ctype) { std::string s(ctype); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } if (s == std::string("PLAIN")) { return LibraryType::kPlain; } else if (s == std::string("MKLDNN")) { diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 7b6dc09bdb5535488c8c4dbc71c9cd6a7998bd0b..ef85ed69dbe87d4b5b1b1b5d6a04220a5266a635 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -43,6 +43,22 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { return os; } +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + PADDLE_ENFORCE(platform::is_cpu_place(t.place())); + PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + os << t.data()[i] << " "; + } + + return os; +} + LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) { LoD new_lod; new_lod.reserve(level_end - level_begin); @@ -177,6 +193,9 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { lod->empty() || lod->size() == lod_length.size(), "The lod_length should has the same size with the appended lod."); if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } *lod = LoD(lod_length.size(), std::vector({0})); } for (size_t i = 0; i < lod->size(); ++i) { @@ -214,9 +233,10 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, SerializeToStream(os, static_cast(tensor), dev_ctx); } -void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { +void DeserializeFromStream(std::istream &is, LoDTensor *tensor, + const platform::DeviceContext &dev_ctx) { { - // the 1st field, unit32_t version for SelectedRows + // the 1st field, unit32_t version for LoDTensor uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); @@ -237,7 +257,71 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { } } // the 3st filed, Tensor - DeserializeFromStream(is, static_cast(tensor)); + DeserializeFromStream(is, static_cast(tensor), dev_ctx); +} + +std::vector LoDTensor::SplitLoDTensor( + const std::vector places) const { + check_memory_size(); + // PADDLE_ENFORCE(lod().empty() || (lod().size() == 1 && lod()[0].empty()) + // , "Disable parallel lod for now"); + PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now"); + PADDLE_ENFORCE(dims()[0] % places.size() == 0, + "Batch size should be divided by places size"); + + std::vector lods; + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + size_t begin = place_idx * dims()[0] / places.size(); + size_t end = (place_idx + 1) * dims()[0] / places.size(); + auto src = Slice(static_cast(begin), static_cast(end)); + + LoDTensor dst; + dst.Resize(src.dims()); + auto &dst_place = places[place_idx]; + auto dst_ptr = dst.mutable_data(dst_place, src.type()); + + // TODO(tonyyang-svail): + // change the following to framework::CopyFrom + auto src_place = src.place(); + auto src_ptr = src.data(); + auto size = src.numel() * SizeOfType(src.type()); + if (platform::is_cpu_place(src_place) && + 
platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } else { + PADDLE_THROW("Not Implemented"); + } + + lods.emplace_back(dst); + } + + return lods; +} + +void LoDTensor::MergeLoDTensor( + const std::vector &lod_tensors, platform::Place place) { + PADDLE_ENFORCE(platform::is_cpu_place(place)); + PADDLE_ENFORCE(!lod_tensors.empty()); + + framework::DDim new_dim = lod_tensors[0]->dims(); + std::type_index new_type = lod_tensors[0]->type(); + for (auto *lod : lod_tensors) { + PADDLE_ENFORCE(new_dim == lod->dims()); + PADDLE_ENFORCE(new_type == lod->type()); + PADDLE_ENFORCE(platform::is_cpu_place(lod->place())); + } + new_dim[0] *= lod_tensors.size(); + Resize(new_dim); + + auto *dst_ptr = reinterpret_cast(mutable_data(place, new_type)); + for (auto *src : lod_tensors) { + auto size = src->numel() * SizeOfType(src->type()); + memory::Copy(boost::get(place), dst_ptr, + boost::get(src->place()), + src->data(), size); + dst_ptr += size; + } } } // namespace framework diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 147db3ab0877662d9e47ae7ee6df05638b5fcbd1..b27936c1986d13d33786035adeebe14da92b190e 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -58,6 +58,7 @@ using Vector = thrust::host_vector< using LoD = std::vector>; std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); /* * Slice levels from a LoD. @@ -144,6 +145,12 @@ class LoDTensor : public Tensor { */ void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end); + std::vector SplitLoDTensor( + const std::vector places) const; + + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); + private: LoD lod_; }; @@ -208,7 +215,8 @@ void AppendLoD(LoD* lod, const LoD& lod_length); */ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, const platform::DeviceContext& dev_ctx); -void DeserializeFromStream(std::istream& is, LoDTensor* tensor); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 0747c8db531d6ae443d76591b945cce0c9bbea2b..0868c1f6e695b8de6a755f167951f404de0942ca 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -132,7 +132,7 @@ TEST_F(LoDTensorTester, SerializeAndDeserialize) { std::ostringstream oss; SerializeToStream(oss, lod_tensor_, cpu_ctx); std::istringstream iss(oss.str()); - DeserializeFromStream(iss, &dst_tensor); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); float* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); for (int i = 0; i < kLodTensorSize; ++i) { EXPECT_EQ(dst_ptr[i], i); diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 3e58e6442edfe006c8aed238f67b9524783601ee..47c91290e4bf90897d35f1b3bce2e1f10ad0782c 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -64,8 +64,9 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); - out_var->SetLoDLevel(in_var->GetLodLevel()); + out_var->SetLoDLevel(in_var->GetLoDLevel()); } + bool IsRuntime() const override; protected: diff --git a/paddle/framework/op_kernel_type.h 
b/paddle/framework/op_kernel_type.h index b06002096fb109da806809f7b908d9768cf095ba..053897784c1c4350deadf39e2a009220d38f65f9 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -26,13 +26,12 @@ namespace framework { struct OpKernelType { struct Hash { size_t operator()(const OpKernelType& key) const { - int place = key.place_.which() + (1 << LEFT_SHIFT); - int data_type = - static_cast(key.data_type_) + (1 << (LEFT_SHIFT + 1)); - int data_layout = - static_cast(key.data_layout_) + (1 << (LEFT_SHIFT + 2)); - int library_type = - static_cast(key.library_type_) + (1 << (LEFT_SHIFT + 3)); + int place = key.place_.which(); + int data_type = static_cast(key.data_type_) << LEFT_SHIFT; + int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); + int library_type = static_cast(key.library_type_) + << (LEFT_SHIFT * 3); + std::hash hasher; return hasher(place + data_type + data_layout + library_type); } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index bdaa25918155caca4b64b0ed60aa3f6be03eb12f..d75c0233e8e0134ddf4edc50c07490a234b65cd0 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -37,8 +37,8 @@ class Registrar { public: // In our design, various kinds of classes, e.g., operators and kernels, // have their corresponding registry and registrar. The action of - // registration is in the constructor of a global registrar variable, which, - // however, are not used in the code that calls package framework, and would + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would // be removed from the generated binary file by the linker. To avoid such // removal, we add Touch to all registrar classes and make USE_OP macros to // call this method. So, as long as the callee code calls USE_OP, the global diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index cef530c6e639f6e2188869fa57d114ec6b885aa8..f7a10ada809e6943e60c2d8cde05b8a9e2a7a2c2 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -12,13 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. 
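The revised `OpKernelType::Hash` packs the four fields into disjoint bit ranges before summing, rather than adding a constant offset to each field as the old code did; as long as every field value fits in `LEFT_SHIFT` bits, distinct keys now produce distinct packed integers. A standalone sketch of the idea, with an invented field width in place of the real `LEFT_SHIFT`:

```cpp
// Illustrative only: shifting fields into non-overlapping bit ranges
// makes the packed value injective, assuming each field < (1 << kShift).
#include <cstdint>
#include <functional>

constexpr int kShift = 4;  // stand-in for LEFT_SHIFT

size_t PackAndHash(int place, int data_type, int data_layout, int library) {
  int64_t packed = place + (static_cast<int64_t>(data_type) << kShift) +
                   (static_cast<int64_t>(data_layout) << (kShift * 2)) +
                   (static_cast<int64_t>(library) << (kShift * 3));
  return std::hash<int64_t>()(packed);
}
```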
*/ -#include "paddle/framework/op_registry.h" +#include #include +#include "paddle/framework/op_registry.h" + namespace pd = paddle::framework; namespace paddle { namespace framework { + class CosineOp : public OperatorBase { public: using OperatorBase::OperatorBase; @@ -215,7 +218,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(InferShapeContext* ctx) const override {} - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); } @@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) { op->Run(scope, cpu_place); } -#ifdef PADDLE_WITH_CUDA TEST(OperatorRegistrar, CUDA) { paddle::framework::proto::OpDesc op_desc; paddle::platform::CUDAPlace cuda_place(0); @@ -263,4 +265,127 @@ TEST(OperatorRegistrar, CUDA) { op->Run(scope, cuda_place); } -#endif + +static int op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + 
paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + paddle::framework::UseALL(); + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); + + // remove cuda kernels + paddle::framework::UseCPU(); + op->Run(scope, cpu_place); + + EXPECT_EQ(op_test_value, -20); + + // add cuda kernels + paddle::framework::UseCUDA(); + op->Run(scope, cuda_place); + + EXPECT_EQ(op_test_value, -30); + + // use cudnn kernel + paddle::framework::UseCUDNN(); + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -40); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index fc7091f1c89f8b3f998f6d1b68f032b76bad2197..a1f1be5f34264c11e8125f78650d63d9996aea84 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include -#include #include "paddle/framework/data_transform.h" +#include "paddle/framework/device_data_transform.h" #include "paddle/framework/executor.h" -#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/operator.h" #include "paddle/framework/shape_inference.h" #include "paddle/framework/var_type.h" @@ -25,6 +25,64 @@ limitations under the License. */ namespace paddle { namespace framework { +std::vector> kKernelPriority; + +void UseCPU() { + kKernelPriority.clear(); + /*Plain CPU*/ + auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain); + kKernelPriority.insert(kKernelPriority.begin(), pair0); +} + +void UseMKLDNN() { + UseCPU(); +#if PADDLE_WITH_MKLML + { + /*MKLDNN Kernel*/ + auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN); + kKernelPriority.insert(kKernelPriority.begin(), pair0); + } +#endif +} + +void UseCUDA() { + UseMKLDNN(); +#if PADDLE_WITH_CUDA + /*Plain GPU*/ + auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain); + kKernelPriority.insert(kKernelPriority.begin(), pair0); +#endif +} + +void UseCUDNN() { + UseCUDA(); +#if PADDLE_WITH_CUDA + if (platform::dynload::HasCUDNN()) { + /*CUDNN Kernel*/ + auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN); + kKernelPriority.insert(kKernelPriority.begin(), pair0); + } +#endif +} + +void UseALL() { + UseCPU(); + UseMKLDNN(); + UseCUDA(); + UseCUDNN(); +} + +static DDim GetDims(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + return DDim({-1}); + } +} + std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -57,7 +115,7 @@ const std::vector& OperatorBase::Outputs( return it->second; } -std::string OperatorBase::DebugString() const { +std::string OperatorBase::DebugStringEx(const Scope* scope) const { std::stringstream ss; ss << "Op(" << type_ << "), inputs:{"; for (auto it = inputs_.begin(); it != inputs_.end();) { @@ -65,6 +123,9 @@ std::string OperatorBase::DebugString() const { ss << input.first << "["; for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; + if (scope) { + ss << "(" << GetDims(*scope, 
input.second[i]) << ")"; + } if (i != input.second.size() - 1) { ss << ", "; } @@ -81,6 +142,9 @@ std::string OperatorBase::DebugString() const { ss << output.first << "["; for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; + if (scope) { + ss << "(" << GetDims(*scope, output.second[i]) << ")"; + } if (i != output.second.size() - 1) { ss << ", "; } @@ -178,6 +242,10 @@ void OperatorBase::GenerateTemporaryNames() { } } +static bool VarIsTensor(const Variable* var) { + return var->IsType() || var->IsType(); +} + static const Tensor* GetTensorFromVar(const Variable* var) { const Tensor* t = nullptr; if (var->IsType()) { @@ -185,7 +253,8 @@ static const Tensor* GetTensorFromVar(const Variable* var) { } else if (var->IsType()) { t = &(var->Get().value()); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); } return t; } @@ -197,7 +266,8 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { } else if (var->IsType()) { t = var->GetMutable()->mutable_value(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); } return t; } @@ -347,6 +417,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); + + // TODO(dzhwinter) : reuse ShareLoD in most operators. + // Need to call ShareLayout explicitly in sequence related ops. + // Shall we have a better method to shared info between in/out Tensor? + out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); } bool IsRuntime() const override { return true; } @@ -359,7 +448,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { return var->Get().GetCompleteDims(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); } } @@ -370,7 +460,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { var->GetMutable()->set_height(dim[0]); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); } } @@ -384,24 +475,6 @@ class RuntimeInferShapeContext : public InferShapeContext { const Scope& scope_; }; -const platform::DeviceContext* GetDeviceContext( - framework::KernelTypePair& kernel_pair) { - auto& actual_kernel_key = kernel_pair.first; - auto& expected_kernel_key = kernel_pair.second; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - - if (platform::is_gpu_place(actual_kernel_key.place_) && - platform::is_cpu_place(expected_kernel_key.place_)) { - return pool.Get(actual_kernel_key.place_); - } else if 
(platform::is_cpu_place(actual_kernel_key.place_) && - platform::is_gpu_place(expected_kernel_key.place_)) { - return pool.Get(expected_kernel_key.place_); - } else { - PADDLE_THROW( - "Currently, model parallelism is only supported between CPU and CUDA"); - } -} - void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); @@ -417,71 +490,43 @@ void OperatorWithKernel::Run(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - // check if op[type] have kernel for kernel_key - OpKernelMap& kernels = kernels_iter->second; - ExecutionContext ctx(*this, scope, *dev_ctx); - auto actual_kernel_key = GetActualKernelType(ctx); - auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key); - auto kernel_iter = kernels.find(expected_kernel_key); - - if (kernel_iter == kernels.end()) { - PADDLE_THROW("The operator %s does not support %s", type_, - expected_kernel_key); - } - - if (actual_kernel_key == expected_kernel_key) { - PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_, - "Currently, model parallelism is only supported between " - "CPU and other devices. For example, multi-GPU model " - "parallelism will failed."); - } else { - auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key); - const DataTransformFn* trans_fun = - DataTransformFnMap::Instance().GetNullable(kernel_pair); - if (trans_fun) { - auto input_vars = this->InputVars(); - // TODO(qijun) filter the input vars that do not need to be transformed - - // filter vars that has been transformed - std::vector need_trans; - for (auto var_name : input_vars) { - auto var_name_trans = - var_name + framework::KernelTypeToString(expected_kernel_key); - if (!scope.FindVar(var_name_trans)) { - const_cast(scope).Var(var_name_trans); - need_trans.push_back(var_name); - } - } - - if (!need_trans.empty()) { - auto trans_dev_ctx = GetDeviceContext(kernel_pair); - - // Wait for transform starting - dev_ctx->Wait(); - - for (auto var_name : need_trans) { - (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)), - scope.FindVar(var_name + framework::KernelTypeToString( - expected_kernel_key))); + auto expected_kernel_key = this->GetExpectedKernelType(ctx); + + Scope& new_scope = scope.NewScope(); + + for (auto& var_name_item : this->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + if (var && VarIsTensor(var)) { + auto* tensor_in = GetTensorFromVar(var); + if (tensor_in->IsInitialized()) { + auto kernel_type_for_var = this->GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + if (kernel_type_for_var != expected_kernel_key) { + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), + var_name) != out_var_names.end()) { + PADDLE_THROW( + "var %s is both input and output, " + "does not support transform", + var_name); + } + VLOG(3) << "need to do transform for var " << var_name; + auto* trans_var = new_scope.Var(var_name); + auto* out = DataTransform(expected_kernel_key, kernel_type_for_var, + *tensor_in); + CopyVariableWithTensor(*var, *out, *trans_var); + } } - // Wait for data transform finishing - trans_dev_ctx->Wait(); } } } - kernel_iter->second->Compute(ctx); -} + OpKernelMap& kernels = kernels_iter->second; + auto kernel_iter = kernels.find(expected_kernel_key); -OpKernelType OperatorWithKernel::GetActualKernelType( - const ExecutionContext& ctx) 
const { - return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); -} - -OpKernelType OperatorWithKernel::GetExpectedKernelType( - const OpKernelType& actual_kernel_type) const { - return actual_kernel_type; + kernel_iter->second->Compute(ExecutionContext(*this, new_scope, *dev_ctx)); } proto::DataType OperatorWithKernel::IndicateDataType( @@ -513,5 +558,16 @@ proto::DataType OperatorWithKernel::IndicateDataType( return static_cast(data_type); } +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place()); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d0a9b643d565d6651fd7ec0b515f088362852ba3..d5feb598649c97a9517b7c2b1764fd54ff9f8693 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -52,10 +53,33 @@ constexpr char kGradVarSuffix[] = "@GRAD"; /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; -// define some kernel hint -const std::string kUseCPU = "use_cpu"; -const std::string kUseCUDNN = "use_cudnn"; -const std::string kUseMKLDNN = "use_mkldnn"; +// define some kernel priority +extern std::vector> kKernelPriority; + +/** + * @brief Use cpu kernel only + */ +void UseCPU(); + +/** + * @brief Perfer MKLDNN kernel than Plain CPU kernel + */ +void UseMKLDNN(); + +/** + * @brief Perfer CUDA kernel than Plain CPU kernel + */ +void UseCUDA(); + +/** + * @brief Perfer cudnn kernel than Plain CUDA kernel + */ +void UseCUDNN(); + +/** + * @brief Use all available kernels + */ +void UseALL(); inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; @@ -84,7 +108,10 @@ class OperatorBase { return boost::get(attrs_.at(name)); } - virtual std::string DebugString() const; + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + + std::string DebugString() const { return DebugStringEx(nullptr); } /// Net will call this function to Run an op. virtual void Run(const Scope& scope, const platform::Place& place) const = 0; @@ -381,9 +408,10 @@ class OperatorWithKernel : public OperatorBase { } protected: - virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const; - virtual OpKernelType GetExpectedKernelType( - const OpKernelType& actual_kernel_type) const; + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; private: // indicate kernel DataType by input data. 
Defaultly all input data must be diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 4d38a7ada91af834aa1a19b49e36d606ebe786ba..d002f3f238862a53ad7286570e2d0bbd2334c584 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -114,7 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} - OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override { + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); } }; diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 0c01d605bcd95f5796fba1e5a3351a2640b2898a..2bd0ac8f5a9eb6439a4196dd9c61e13797c1a8e3 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include // for unique_ptr #include // for call_once #include "glog/logging.h" +#include "paddle/framework/threadpool.h" #include "paddle/string/printf.h" namespace paddle { @@ -87,7 +88,8 @@ void Scope::DeleteScope(Scope* scope) { auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); - delete scope; + // Make delete async. + Async([scope] { delete scope; }); } void Scope::Rename(const std::string& origin_name, @@ -107,6 +109,7 @@ std::string Scope::Rename(const std::string& origin_name) const { Rename(origin_name, var_name); return var_name; } + Variable* Scope::FindVarLocally(const std::string& name) const { auto it = vars_.find(name); if (it != vars_.end()) return it->second; diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 10143326dfa201894c777b3e5e226d5ca5015eda..a1da81cc7977d2f31b99c41fb3db3ec03188f954 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -75,9 +75,9 @@ class Scope { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; - private: Variable* FindVarLocally(const std::string& name) const; + private: // Call Scope::NewScope for a sub-scope. 
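The `Async` call that `Scope::DeleteScope` now uses comes from the threadpool changes later in this patch; deleting a scope can free many variables, and deferring the `delete` keeps that work off the executor's hot path (that rationale is our reading, the patch only says "Make delete async"). The general pattern, sketched on a plain heap object:

```cpp
// Sketch: fire-and-forget deletion on the shared thread pool.
#include <vector>
#include "paddle/framework/threadpool.h"

void DropAsync(std::vector<float>* big) {
  // Async returns a std::future<void>; keep and wait() on it only if
  // the caller must observe that the deletion has finished.
  auto f = paddle::framework::Async([big] { delete big; });
  (void)f;
}
```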
explicit Scope(Scope const* parent) : parent_(parent) {} diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc index 82adfa7123a3cf40d929021602c45fe7d2e34ffa..3b3e60177a495cc99f38ee8b82af41c4c76b8652 100644 --- a/paddle/framework/selected_rows.cc +++ b/paddle/framework/selected_rows.cc @@ -37,8 +37,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, SerializeToStream(os, selected_rows.value(), dev_ctx); } -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) { - auto tensor = *selected_rows->mutable_value(); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx) { { // the 1st field, unit32_t version for SelectedRows uint32_t version; @@ -62,7 +62,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) { selected_rows->set_height(height); } // the 4st field, tensor which contains the data - DeserializeFromStream(is, &tensor); + DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx); } } // namespace framework diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h index 699e392688e9889f050592172f8bfc45f855d0b1..30d3dfc1e89f073a8180ceacf77619b36f7079a9 100644 --- a/paddle/framework/selected_rows.h +++ b/paddle/framework/selected_rows.h @@ -66,7 +66,8 @@ class SelectedRows { */ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx); -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx); } // namespace framework } // namespace paddle diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc index 75487c4010391aa9e519d73058184fa936dabb84..8ff3fb6a97199a2798ab29c56957a0f77fa26628 100644 --- a/paddle/framework/selected_rows_test.cc +++ b/paddle/framework/selected_rows_test.cc @@ -51,10 +51,12 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { SerializeToStream(oss, *selected_rows_, cpu_ctx); std::istringstream iss(oss.str()); - DeserializeFromStream(iss, &dst_tensor); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); + ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); + ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); } } // namespace framework diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 341a6949beeb2dfa64b23d2079bd8f48750a94f8..4aaa29d794c95592832a1fe990e2dce274eba9d5 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -55,6 +55,10 @@ class Tensor { template inline const T* data() const; + inline bool IsInitialized() const; + + inline void switch_place(platform::Place new_place); + /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. @@ -200,6 +204,15 @@ class Tensor { size_t offset_; }; +inline void Tensor::switch_place(platform::Place new_place) { + if (holder_->place() == new_place) { + return; + } + + // TODO(tonyyang-svail): do memcpy here. 
+ PADDLE_THROW("Not Implemented"); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 6c6f298edc187a87677089e54c4c9046821282df..1340c5e48520ccdd537e694abf452fd79129df99 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -84,6 +84,8 @@ inline const T* Tensor::data() const { reinterpret_cast(holder_->ptr()) + offset_); } +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + template inline T* Tensor::data() { check_memory_size(); diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 6a21f8db1e3966fd23eee0da2346b2d61f9321fb..5ac13cba4dae9c058e2d96da24dab01f44ece772 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -270,7 +270,23 @@ inline void SerializeToStream(std::ostream& os, const Tensor& tensor, } } -inline void DeserializeFromStream(std::istream& is, Tensor* tensor) { +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void operator()() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); @@ -289,27 +305,28 @@ inline void DeserializeFromStream(std::istream& is, Tensor* tensor) { dims.reserve(static_cast(desc.dims().size())); std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(framework::make_ddim(dims)); - void* buf; - platform::Place cpu = platform::CPUPlace(); - // TODO(Yancey1989): use VisiterDataType instead of DataType switch - switch (desc.data_type()) { - case proto::FP32: - buf = tensor->mutable_data(cpu); - break; - case proto::FP64: - buf = tensor->mutable_data(cpu); - break; - case proto::INT32: - buf = tensor->mutable_data(cpu); - break; - case proto::INT64: - buf = tensor->mutable_data(cpu); - break; - default: - PADDLE_THROW("DataType %d not supported", desc.data_type()); + auto ctx = platform::CPUDeviceContext(); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), cpu_tensor.memory_size()); + auto cpu_place = new platform::CPUPlace(); + framework::CopyFrom(cpu_tensor, *cpu_place, dev_ctx, tensor); + delete cpu_place; +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + is.read(static_cast(buf), tensor->memory_size()); } - is.read(static_cast(buf), tensor->memory_size()); } } diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 0dc5166fcabf77b48b8681ab1f050e2bc88f44ab..15cd2bd09c4a34bc7a5bb8645762a3e0aaefd713 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -270,11 +270,12 @@ TEST(Tensor, SerializeAndDeserialize) { SerializeToStream(oss, src_tensor, cpu_ctx); std::istringstream iss(oss.str()); - DeserializeFromStream(iss, &dst_tensor); + DeserializeFromStream(iss, 
&dst_tensor, cpu_ctx); int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); for (int i = 0; i < 5; ++i) { ASSERT_EQ(dst_ptr[i], array[i]); } + ASSERT_EQ(dst_tensor.dims(), src_tensor.dims()); delete place; } #ifdef PADDLE_WITH_CUDA @@ -292,13 +293,12 @@ TEST(Tensor, SerializeAndDeserialize) { SerializeToStream(oss, gpu_tensor, gpu_ctx); std::istringstream iss(oss.str()); - DeserializeFromStream(iss, &dst_tensor); + DeserializeFromStream(iss, &dst_tensor, gpu_ctx); int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); for (int i = 0; i < 6; ++i) { ASSERT_EQ(dst_ptr[i], array[i]); } - delete gpu_place; } #endif diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h index bcd8190755083ec30687675602a1c95a9c15c69e..3ac345851c38557f82698786dd3bc8e1202a4256 100644 --- a/paddle/framework/threadpool.h +++ b/paddle/framework/threadpool.h @@ -29,7 +29,6 @@ namespace framework { class ThreadPool { public: typedef std::packaged_task Task; - typedef std::function Fun; /** * @brief Get a instance of threadpool, the thread number will @@ -67,7 +66,8 @@ class ThreadPool { * @return std::future, we could wait for the task finished by * f.wait(). */ - std::future Run(const Fun& fn) { + template + std::future Run(Callback fn) { std::unique_lock lock(mutex_); Task task(std::bind(fn)); std::future f = task.get_future(); @@ -159,5 +159,13 @@ class ThreadPool { std::condition_variable completed_; }; +// Run a function asynchronously. +// NOTE: The function must return void. If the function need to return a value, +// you can use lambda to capture a value pointer. +template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index 7d002b9ea0b597730685ee03b021c4982f787f49..aeab18d7214f8d9dd79bc3d2e0322490445b3b49 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -52,7 +52,7 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { } } -int32_t VarDesc::GetLodLevel() const { +int32_t VarDesc::GetLoDLevel() const { switch (desc_.type()) { case proto::VarDesc::LOD_TENSOR: return desc_.lod_tensor().lod_level(); diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index 4fd2abe7fb215c3ac454de3e30754685111eb570..fc482c467404a6b9dfed64c43871d91d3d10c766 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -76,7 +76,7 @@ class VarDesc { void SetLoDLevel(int32_t lod_level); - int32_t GetLodLevel() const; + int32_t GetLoDLevel() const; proto::VarDesc::VarType GetType() const; diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h index 0e6ea8dc69fe9b7fdaa1163b8d63295624cd3abc..5b7a08a08732a6ccbc206f6a4f0aa4788ce4a219 100644 --- a/paddle/framework/var_type.h +++ b/paddle/framework/var_type.h @@ -17,6 +17,8 @@ limitations under the License. 
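`DeserializeFromStream` now threads a `DeviceContext` through every layer (Tensor, LoDTensor, SelectedRows), so a serialized tensor can be restored directly to the device the context describes. A CPU round-trip sketch mirroring the updated tests above:

```cpp
// Sketch of the new serialize/deserialize pair on CPU.
#include <sstream>
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/device_context.h"

void RoundTrip(const paddle::framework::LoDTensor& src) {
  paddle::platform::CPUDeviceContext cpu_ctx;
  std::ostringstream oss;
  paddle::framework::SerializeToStream(oss, src, cpu_ctx);

  std::istringstream iss(oss.str());
  paddle::framework::LoDTensor dst;
  // The dev_ctx decides where the buffer lands; with a CUDA context the
  // implementation stages through a CPU tensor and CopyFroms to device.
  paddle::framework::DeserializeFromStream(iss, &dst, cpu_ctx);
}
```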
*/ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/variable.h" namespace paddle { namespace framework { @@ -35,7 +37,7 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) { } template -inline void VisitVarType(const Variable& var, Visitor visitor) { +inline void VisitVarType(const framework::Variable& var, Visitor visitor) { switch (ToVarType(var.Type())) { case proto::VarDesc_VarType_LOD_TENSOR: visitor(var.Get()); diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index e5a94759f9230ab4ce9d2cc24849a2debb8a5e2f..36b76fb196cfd4c7b3697dcf0cda9a23ff53deb3 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -32,6 +32,8 @@ class Variable { return *static_cast(holder_->Ptr()); } + bool IsInitialized() const { return holder_ != nullptr; } + template T* GetMutable() { if (!IsType()) { diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index a2f07937b8834e3f3fa7a6bf2ae10f29a8d84f29..ba83667ebc9a89c37f77a7f71e6df90b54723cc0 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) { for (auto reversed : {false, true}) { config.layerConfig.set_reversed(reversed); config.testState = !reversed; - testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu); + testLayerGrad( + config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0); } } } @@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) { for (auto reversed : {false, true}) { config.layerConfig.set_reversed(reversed); config.testState = !reversed; - testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu); + testLayerGrad( + config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02); } } for (auto useGpu : {true}) { diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8437b2b21942ead544dab8636db1b355b7cf7bd5 --- /dev/null +++ b/paddle/inference/CMakeLists.txt @@ -0,0 +1,47 @@ +set(FLUID_CORE_MODULES + backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB}) + +cc_library(paddle_fluid_api + SRCS inference.cc + DEPS ${FLUID_CORE_MODULES}) + +# Merge all modules into a simgle static library +cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES}) + +# ptools +# just for testing, we may need to change the storing format for inference_model +# and move the dependent of pickle. 
+# download from http://www.picklingtools.com/ +# build in the C++ sub-directory, using command +# make -f Makefile.Linux libptools.so +set(PTOOLS_LIB) +set(PTOOLS_ROOT $ENV{PTOOLS_ROOT} CACHE PATH "Folder contains PicklingTools") +find_path(PTOOLS_INC_DIR chooseser.h PATHS ${PTOOLS_ROOT}/C++) +find_library(PTOOLS_SHARED_LIB NAMES ptools PATHS ${PTOOLS_ROOT}/C++) +if(PTOOLS_INC_DIR AND PTOOLS_SHARED_LIB) + add_definitions(-DPADDLE_USE_PTOOLS) + set(PTOOLS_LIB ptools) + message(STATUS "Found PicklingTools: ${PTOOLS_SHARED_LIB}") + add_library(${PTOOLS_LIB} SHARED IMPORTED GLOBAL) + set_property(TARGET ${PTOOLS_LIB} PROPERTY IMPORTED_LOCATION ${PTOOLS_SHARED_LIB}) + include_directories(${PTOOLS_ROOT}/C++) + include_directories(${PTOOLS_ROOT}/C++/opencontainers_1_8_5/include) + add_definitions(-DOC_NEW_STYLE_INCLUDES) # used in ptools +endif() + +add_executable(example example.cc) +if(APPLE) + set(OPTIONAL_LINK_FLAGS) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") + set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup") + endif() + target_link_libraries(example + -Wl,-force_load paddle_fluid + ${OPTIONAL_LINK_FLAGS} + ${PTOOLS_LIB}) +else() + target_link_libraries(example + -Wl,--start-group -Wl,--whole-archive paddle_fluid + -Wl,--no-whole-archive -Wl,--end-group + ${PTOOLS_LIB}) +endif() diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc new file mode 100644 index 0000000000000000000000000000000000000000..9711b20e6fb4099a2cc497029468ebd1fd0b3456 --- /dev/null +++ b/paddle/inference/example.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
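The `-Wl,--whole-archive` / `-force_load` flags around `paddle_fluid` in the CMake above exist for the same reason as the `Touch` trick described in the op_registry.h comment earlier in this patch: operator registration runs in constructors of global registrar objects, and a linker will drop those object files from a static archive when nothing references them. A toy illustration of the pattern, with invented names:

```cpp
// Toy self-registration: nothing references `reg`, so when this object
// file sits in a static library, linkers omit it unless forced in with
// --whole-archive (GNU ld) or -force_load (Apple ld).
#include <map>
#include <string>

std::map<std::string, int>& Registry() {
  static std::map<std::string, int> r;
  return r;
}

struct Registrar {
  Registrar(const std::string& name, int id) { Registry()[name] = id; }
};

static Registrar reg("my_op", 42);  // runs before main -- if linked at all
```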
*/ + +#include +#include +#include "gflags/gflags.h" +#include "paddle/inference/inference.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(feed_var_names, "", "Names of feeding variables"); +DEFINE_string(fetch_var_names, "", "Names of fetching variables"); + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_dirname.empty() || FLAGS_feed_var_names.empty() || + FLAGS_fetch_var_names.empty()) { + // Example: + // ./example --dirname=recognize_digits_mlp.inference.model + // --feed_var_names="x" + // --fetch_var_names="fc_2.tmp_2" + std::cout << "Usage: ./example --dirname=path/to/your/model " + "--feed_var_names=x --fetch_var_names=y" + << std::endl; + exit(1); + } + + std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::cout << "FLAGS_feed_var_names: " << FLAGS_feed_var_names << std::endl; + std::cout << "FLAGS_fetch_var_names: " << FLAGS_fetch_var_names << std::endl; + + std::string dirname = FLAGS_dirname; + std::vector feed_var_names = {FLAGS_feed_var_names}; + std::vector fetch_var_names = {FLAGS_fetch_var_names}; + + paddle::InferenceEngine* engine = new paddle::InferenceEngine(); + engine->LoadInferenceModel(dirname, feed_var_names, fetch_var_names); + + paddle::framework::LoDTensor input; + srand(time(0)); + float* input_ptr = + input.mutable_data({1, 784}, paddle::platform::CPUPlace()); + for (int i = 0; i < 784; ++i) { + input_ptr[i] = rand() / (static_cast(RAND_MAX)); + } + + std::vector feeds; + feeds.push_back(input); + std::vector fetchs; + engine->Execute(feeds, fetchs); + + for (size_t i = 0; i < fetchs.size(); ++i) { + auto dims_i = fetchs[i].dims(); + std::cout << "dims_i:"; + for (int j = 0; j < dims_i.size(); ++j) { + std::cout << " " << dims_i[j]; + } + std::cout << std::endl; + std::cout << "result:"; + float* output_ptr = fetchs[i].data(); + for (int j = 0; j < paddle::framework::product(dims_i); ++j) { + std::cout << " " << output_ptr[j]; + } + std::cout << std::endl; + } + + delete engine; + return 0; +} diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc new file mode 100644 index 0000000000000000000000000000000000000000..49e39358e81bbee64a618be88ee0fca6aa438b93 --- /dev/null +++ b/paddle/inference/inference.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
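One note on the example's input: `srand(time(0))` plus `rand()` produces different inputs on every run, which is fine for a smoke test but makes fetched outputs hard to compare across runs. A seeded `<random>` generator is a drop-in alternative (our suggestion, not part of the patch):

```cpp
// Sketch: reproducible random input for the inference example.
#include <random>

void FillInput(float* input_ptr, int n) {
  std::mt19937 gen(42);  // fixed seed => identical inputs each run
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (int i = 0; i < n; ++i) {
    input_ptr[i] = dist(gen);
  }
}
```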
*/ + +#include "inference.h" +#include +#include "paddle/framework/executor.h" +#include "paddle/framework/feed_fetch_method.h" +#include "paddle/framework/init.h" +#include "paddle/framework/scope.h" + +#ifdef PADDLE_USE_PTOOLS +#include "chooseser.h" +#endif + +namespace paddle { + +void InferenceEngine::LoadInferenceModel( + const std::string& dirname, + const std::vector& feed_var_names, + const std::vector& fetch_var_names) { +#ifdef PADDLE_USE_PTOOLS + std::string model_filename = dirname + "/__model__"; + LOG(INFO) << "Using PicklingTools, loading model from " << model_filename; + Val v; + LoadValFromFile(model_filename.c_str(), v, SERIALIZE_P0); + std::string program_desc_str = v["program_desc_str"]; + LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); +// PicklingTools cannot parse the vector of strings correctly. +#else + std::string model_filename = dirname + "/__model__.dat"; + LOG(INFO) << "loading model from " << model_filename; + std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary); + std::string program_desc_str; + inputfs.seekg(0, std::ios::end); + program_desc_str.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); + inputfs.read(&program_desc_str[0], program_desc_str.size()); + inputfs.close(); +#endif + program_ = new framework::ProgramDesc(program_desc_str); + GenerateLoadProgram(dirname); + + if (feed_var_names.empty() || fetch_var_names.empty()) { + LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names."; + } + feed_var_names_ = feed_var_names; + fetch_var_names_ = fetch_var_names; + PrependFeedOp(); + AppendFetchOp(); +} + +bool InferenceEngine::IsParameter(const framework::VarDesc* var) { + if (var->Persistable()) { + // There are many unreachable variables in the program + for (size_t i = 0; i < program_->Size(); ++i) { + const framework::BlockDesc& block = program_->Block(i); + for (auto* op : block.AllOps()) { + for (auto input_argument_name : op->InputArgumentNames()) { + if (input_argument_name == var->Name()) { + return true; + } + } + } + } + } + return false; +} + +void InferenceEngine::GenerateLoadProgram(const std::string& dirname) { + framework::BlockDesc* global_block = program_->MutableBlock(0); + + load_program_ = new framework::ProgramDesc(); + framework::BlockDesc* load_block = load_program_->MutableBlock(0); + for (auto* var : global_block->AllVars()) { + if (IsParameter(var)) { + LOG(INFO) << "parameter's name: " << var->Name(); + + framework::VarDesc* new_var = load_block->Var(var->Name()); + new_var->SetShape(var->Shape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } +} + +void InferenceEngine::PrependFeedOp() { + if (!program_) { + LOG(FATAL) << "Please initialize the program_ first."; + } + + framework::BlockDesc* global_block = program_->MutableBlock(0); + + // create_var + framework::VarDesc* feed_var = global_block->Var("feed"); + feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH); + feed_var->SetPersistable(true); + + // prepend feed_op + for (size_t i = 0; i < feed_var_names_.size(); ++i) { + std::string var_name = feed_var_names_[i]; + LOG(INFO) << "feed var's name: " 
<< var_name; + + // prepend_op + framework::OpDesc* op = global_block->PrependOp(); + op->SetType("feed"); + op->SetInput("X", {"feed"}); + op->SetOutput("Out", {var_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + } +} + +void InferenceEngine::AppendFetchOp() { + if (!program_) { + LOG(FATAL) << "Please initialize the program_ first."; + } + + framework::BlockDesc* global_block = program_->MutableBlock(0); + + // create_var + framework::VarDesc* fetch_var = global_block->Var("fetch"); + fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST); + fetch_var->SetPersistable(true); + + // append fetch_op + for (size_t i = 0; i < fetch_var_names_.size(); ++i) { + std::string var_name = fetch_var_names_[i]; + LOG(INFO) << "fetch var's name: " << var_name; + + // append_op + framework::OpDesc* op = global_block->AppendOp(); + op->SetType("fetch"); + op->SetInput("X", {var_name}); + op->SetOutput("Out", {"fetch"}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + } +} + +void InferenceEngine::Execute(const std::vector& feeds, + std::vector& fetchs) { + if (!program_ || !load_program_) { + LOG(FATAL) << "Please initialize the program_ and load_program_ first."; + } + + if (feeds.size() < feed_var_names_.size()) { + LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors."; + } + + auto* place = new platform::CPUPlace(); + framework::InitDevices({"CPU"}); + framework::Executor* executor = new framework::Executor(*place); + framework::Scope* scope = new framework::Scope(); + + executor->Run(*load_program_, scope, 0, true, true); + + // set_feed_variable + for (size_t i = 0; i < feed_var_names_.size(); ++i) { + framework::SetFeedVariable(scope, feeds[i], "feed", i); + } + + executor->Run(*program_, scope, 0, true, true); + + // get_fetch_variable + fetchs.resize(fetch_var_names_.size()); + for (size_t i = 0; i < fetch_var_names_.size(); ++i) { + fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i); + } + + delete place; + delete scope; + delete executor; +} +} // namespace paddle diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h new file mode 100644 index 0000000000000000000000000000000000000000..a3f3ef4b440036a0b27353cc092eed1bbf96eeb3 --- /dev/null +++ b/paddle/inference/inference.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
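`Execute` relies on the feed/fetch pairing set up above: `SetFeedVariable` stores each input LoDTensor in the scope's `feed` list at column `i`, the prepended `feed` ops copy column `i` into the named input variable, and symmetrically the appended `fetch` ops write results into the `fetch` list that `GetFetchVariable` reads back. Sketched the way it is used here (the executor call is elided):

```cpp
// Sketch of the feed/fetch contract used by InferenceEngine::Execute.
// Column indices must line up with the "col" attribute of the feed
// and fetch ops inserted into the program.
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/scope.h"

void FeedThenFetch(paddle::framework::Scope* scope,
                   const paddle::framework::LoDTensor& input) {
  paddle::framework::SetFeedVariable(scope, input, "feed", /*index=*/0);
  // ... executor->Run(program, scope, 0, true, true) goes here ...
  paddle::framework::LoDTensor out =
      paddle::framework::GetFetchVariable(*scope, "fetch", /*index=*/0);
  (void)out;
}
```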
*/ + +#pragma once + +#include "paddle/framework/block_desc.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/program_desc.h" + +namespace paddle { + +class InferenceEngine { +public: + InferenceEngine() : program_(nullptr), load_program_(nullptr) {} + ~InferenceEngine() { + delete program_; + delete load_program_; + } + + void LoadInferenceModel(const std::string& dirname, + const std::vector& feed_var_names, + const std::vector& fetch_var_names); + void Execute(const std::vector& feeds, + std::vector& fetchs); + +private: + bool IsParameter(const framework::VarDesc* var); + void GenerateLoadProgram(const std::string& dirname); + void PrependFeedOp(); + void AppendFetchOp(); + +private: + framework::ProgramDesc* program_; + framework::ProgramDesc* load_program_; + std::vector feed_var_names_; + std::vector fetch_var_names_; +}; + +} // namespace paddle diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 9f603474de2f845822651c707174a8804ecf5aad..f1ce52332327ed2a9f290ccf412199fd5a6bbb67 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -61,106 +61,28 @@ function(op_library TARGET) ${op_common_deps}) endif() - # net_op doesn't need pybind - if ("${TARGET}" STREQUAL "net_op") - set(pybind_flag 1) - endif() - - if ("${TARGET}" STREQUAL "compare_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") - endif() - - # conv_op contains several operators - if ("${TARGET}" STREQUAL "conv_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d);\n") - endif() - - # conv_cudnn_op contains several operators - if ("${TARGET}" STREQUAL "conv_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n") - endif() - - # pool_op contains several operators - if ("${TARGET}" STREQUAL "pool_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d);\n") - endif() - - # pool_cudnn_op contains several operators - if ("${TARGET}" STREQUAL "pool_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") - endif() - - if ("${TARGET}" STREQUAL "logical_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(logical_and);\n") - endif() - - # pool_with_index_op contains several operators - if ("${TARGET}" STREQUAL "pool_with_index_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") - endif() - - # conv_transpose_op contains several operators - if ("${TARGET}" STREQUAL "conv_transpose_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n") - endif() - - # conv_transpose_cudnn_op contains two operators - if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n") - endif() - - # save_restore_op contains several operators - if ("${TARGET}" STREQUAL "save_restore_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n") - endif() - - # activation_op contains several operators - if ("${TARGET}" STREQUAL 
"activation_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(sigmoid);\n") - endif() - - # nccl_op contains several operators - if ("${TARGET}" STREQUAL "nccl_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") - endif() - - # reduce_op contains several operators - if ("${TARGET}" STREQUAL "reduce_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") - endif() + # Define operators that don't need pybind here. + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() - if ("${TARGET}" STREQUAL "tensor_array_read_write_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n") + # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. + file(READ ${TARGET}.cc TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") endif() # pybind USE_NO_KERNEL_OP # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel - file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -171,7 +93,6 @@ function(op_library TARGET) # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) @@ -188,6 +109,7 @@ add_subdirectory(nccl) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() @@ -230,6 +152,7 @@ op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) op_library(cos_sim_op DEPS cos_sim_functor) +op_library(parallel_do_op DEPS executor) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) @@ -238,6 +161,8 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index d7baa6e90538484b400f32587a052d394a8d10d5..8e8a3c7dd3036317fac29b709d7a29e18f017503 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel { } 
protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 0885f7c570b9b52dc51597347295734fd689da8d..88c3d1c597a853abdee7753a5110be4a1726e905 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -26,12 +27,16 @@ class ActivationKernel using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - Out->mutable_data(context.GetPlace()); - - auto x = framework::EigenVector::Flatten(*X); - auto out = framework::EigenVector::Flatten(*Out); + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + Out.mutable_data(context.GetPlace()); + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); auto* place = context.template device_context().eigen_device(); Functor functor; diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 0d77dbcbacd4efb6c1900e57b5c4ea9e9b136771..66f5b0f449a4f11a3c734c98a6a97833763348a1 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -47,8 +47,7 @@ class AdagradOpKernel : public framework::OpKernel { *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( *ctx.Input("Moment")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); + auto* learning_rate = ctx.Input("LearningRate"); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); @@ -56,8 +55,16 @@ class AdagradOpKernel : public framework::OpKernel { moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* lr = learning_rate->data(); + param_out.device(*place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } else { + auto lr = framework::EigenVector::Flatten(*learning_rate); + param_out.device(*place) = + param - + lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc index c16bc11931e6733d567107913521eafc34a30066..b6494f95097bdc87081950815e910beda5d6850d 100644 --- a/paddle/operators/auc_op.cc +++ b/paddle/operators/auc_op.cc @@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), diff 
--git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index 98db28ddee7c6cdb37fe7732649d4fc38de7b873..0e984c38ba78bddc232ce43bd0982408e837abe3 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -64,7 +64,7 @@ class BatchNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "Input X must have 2 to 5 dimensions."); - const int C = + const int64_t C = (data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); @@ -78,6 +78,7 @@ class BatchNormOp : public framework::OperatorWithKernel { ctx->SetOutputDim("VarianceOut", {C}); ctx->SetOutputDim("SavedMean", {C}); ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); } }; @@ -305,7 +306,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index a040404266c3cd44230b141cfed1aaede3f05187..44f667aead9ac88fee57310e06e3192732a8d908 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -55,10 +55,10 @@ class ChunkEvalOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType(framework::proto::DataType::FP32, - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index 74ab435c860b22b2ee3f485743540976a7a31b96..300aff90c0af666a95b7d4a0329de709e48ceddb 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -145,6 +145,7 @@ class ChunkEvalKernel : public framework::OpKernel { context.Attr>("excluded_chunk_types").end()); auto* inference = context.Input("Inference"); + auto place = inference->place(); auto* label = context.Input("Label"); auto* precision = context.Output("Precision"); auto* recall = context.Output("Recall"); @@ -155,15 +156,15 @@ class ChunkEvalKernel : public framework::OpKernel { const int64_t* inference_data = inference->data(); const int64_t* label_data = label->data(); - T* precision_data = precision->mutable_data(context.GetPlace()); - T* racall_data = recall->mutable_data(context.GetPlace()); - T* f1_data = f1->mutable_data(context.GetPlace()); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); int64_t* num_infer_chunks_data = - num_infer_chunks->mutable_data(context.GetPlace()); + num_infer_chunks->mutable_data(place); int64_t* num_label_chunks_data = - num_label_chunks->mutable_data(context.GetPlace()); + num_label_chunks->mutable_data(place); int64_t* num_correct_chunks_data = - num_correct_chunks->mutable_data(context.GetPlace()); + num_correct_chunks->mutable_data(place); *num_infer_chunks_data = 0; *num_label_chunks_data = 0; *num_correct_chunks_data = 0; diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index 44665b7872acab8178552e5504916408cf566d13..daa2c193b48fe216ff284169a3dce1b4cd40a791 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -66,9 +66,9 @@ class CompareOp : 
public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 0aa7dd48cafc3e2387ac902882d84ce9029cfcd0..0c5ed3e4e80304c6fd174975166804347feb18b1 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -315,10 +315,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); - +// TODO(dzhwinter) : below register should be removed REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, paddle::operators::CudnnConvOpKernel, paddle::operators::CudnnConvOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index e65a5dce52c3c51d3d6bee1684c1e97230203d38..ad84524e1785e3b6b4586c83001852b6dba7afe8 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -44,14 +44,12 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { paddings.size(), strides.size(), "Conv paddings dimension and Conv strides dimension should be the same."); - int input_channels = in_dims[1]; - PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, "The number of input channels should be equal to filter " "channels * groups."); - int output_channels = filter_dims[0]; PADDLE_ENFORCE_EQ( - output_channels % groups, 0, + filter_dims[0] % groups, 0, "The number of output channels should be divided by groups."); std::vector output_shape({in_dims[0], filter_dims[0]}); @@ -66,6 +64,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { dilations[i], paddings[i], strides[i])); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Output"); } Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 024e1d061a5b2eabc27110d5379cb8226a104079..30626028c137a4ac3acf67f37ba5c5bb75215b3a 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -120,17 +120,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), - ctx.device_context()); - } - - framework::OpKernelType GetExpectedKernelType( - const framework::OpKernelType& actual_kernel_type) const override { - return framework::OpKernelType(actual_kernel_type.data_type_, - platform::CPUPlace()); + platform::CPUPlace()); } }; } // namespace operators diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index f6827b7b1128251b2bb7e0a6a032389e5adc1371..ce2f4e6622c21e1e5383b4a3aefc2987bf155aec 100644 --- 
a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -28,9 +28,6 @@ template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "The crf_decoding operator can only run on CPU."); - auto* emission_weights = ctx.Input("Emission"); auto* transition_weights = ctx.Input("Transition"); auto* label = ctx.Input("Label"); diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index fe39cb481aa63fa401603d97778b337282511ab5..7abd5b1c61d610f4f723b13ad6ce61791b96d06d 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc index b746f9df4640bac7b5c4e83091f2c2ab67e778be..319404e56a5f3c407f313991240bbbb85fd39a2a 100644 --- a/paddle/operators/detail/recv_impl.cc +++ b/paddle/operators/detail/recv_impl.cc @@ -21,14 +21,9 @@ namespace detail { Status SendRecvServerImpl::SendVariable(ServerContext *context, const VariableMessage *in_var, VoidMessage *out_var) { - // TODO(typhoonzero): support different variable types. 
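// A sketch of the replacement flow in the recv/send hunks below (assumed from
// the code, with the template arguments that this listing elides restored):
// SendVariable no longer deserializes eagerly; it queues the raw wire message,
// and the consumer deserializes once the variable type is known:
//
//   detail::MessageWithName m =  // pair<std::string, sendrecv::VariableMessage>
//       std::make_pair(in_var->varname(), std::move(*in_var));
//   var_recv_queue_.Push(std::move(m));                 // server: SendVariable()
//   ...
//   const detail::MessageWithName &v = rpc_service_->Get();  // consumer: recv_op
//   detail::DeserializeFromMessage(v.second, dev_ctx, var);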
- std::istringstream iss(in_var->serialized()); - framework::LoDTensor t; - framework::DeserializeFromStream(iss, &t); - TensorWithName tensor_with_name = - std::make_pair(in_var->varname(), std::move(t)); - - var_recv_queue_.Push(std::move(tensor_with_name)); + MessageWithName msg_with_name = + std::make_pair(in_var->varname(), std::move(*in_var)); + var_recv_queue_.Push(std::move(msg_with_name)); return Status::OK; } @@ -37,14 +32,8 @@ Status SendRecvServerImpl::GetVariable(ServerContext *context, VariableMessage *out_var) { std::string get_var_name = in_var->varname(); auto *var = scope_->FindVar(get_var_name); - auto tensor = var->Get(); - std::ostringstream oss; - framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext()); - std::string *varname = out_var->mutable_varname(); - *varname = get_var_name; - std::string *serialized = out_var->mutable_serialized(); - *serialized = oss.str(); + SerializeToMessage(get_var_name, var, platform::CPUDeviceContext(), out_var); return Status::OK; } diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc index a812fcf39bc19a6e06dc594c13076696e7949372..ae85cf2cec2cd8e046c0c7fd3408f2212f225819 100644 --- a/paddle/operators/detail/send_impl.cc +++ b/paddle/operators/detail/send_impl.cc @@ -27,14 +27,8 @@ bool RPCClient::SendVariable(const framework::Scope& scope, auto ctx = platform::CPUDeviceContext(); auto* var = scope.FindVar(inname); PADDLE_ENFORCE(var); - // TODO(typhoonzero): support SelectedRows - PADDLE_ENFORCE(var->IsType(), - "Only support LoDTensor, %s has wrong type", inname); - const framework::LoDTensor& tensor = var->Get(); - std::ostringstream oss; - framework::SerializeToStream(oss, tensor, ctx); - msg.set_varname(inname); - msg.set_serialized(oss.str()); + SerializeToMessage(inname, var, ctx, &msg); + Status status = stub_->SendVariable(&context, msg, &out_msg); if (!status.ok()) { LOG(ERROR) << "gRPC error: " << status.error_message(); @@ -50,19 +44,15 @@ bool RPCClient::GetVariable(const framework::Scope& scope, call_msg.set_varname(outname); auto ctx = platform::CPUDeviceContext(); Status status = stub_->GetVariable(&context, call_msg, &ret_msg); + auto* outvar = scope.FindVar(outname); if (!status.ok()) { LOG(ERROR) << "gRPC error: " << status.error_message(); return false; } std::istringstream iss(ret_msg.serialized()); + DeserializeFromMessage(ret_msg, ctx, outvar); - framework::LoDTensor ret_tensor; - framework::DeserializeFromStream(iss, &ret_tensor); - auto* outvar = scope.FindVar(outname); - framework::LoDTensor* out_tensor = outvar->GetMutable(); - // FIXME(typhoonzero): do not copy. - framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor); return true; } diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 95c8e708986eac2a27ece70864efd7eac5f84ce8..f141c755ce14ef540aeab32c11c289179aff3f8c 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -1,7 +1,6 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under +the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -13,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto3"; - package sendrecv; service SendRecvService { @@ -29,12 +27,18 @@ service SendRecvService { // VariableMessage is serialized paddle variable message. // It can be: -// Tensor // LoDTensor // SelectedRows +enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; +} + message VariableMessage { string varname = 1; - bytes serialized = 2; + // TODO(Yancey1989): reference framework::proto::VarDesc::VarType + VarType type = 2; + bytes serialized = 3; } message VoidMessage {} diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h index 47f730f7ae897096fbdd23a55252448cf6655fb4..1fe54f1f0536aed7d41bbdeeca076534abafe98d 100644 --- a/paddle/operators/detail/send_recv_impl.h +++ b/paddle/operators/detail/send_recv_impl.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once -#include "paddle/framework/data_type.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/scope.h" #include "paddle/framework/selected_rows.h" +#include "paddle/framework/var_type.h" #include "paddle/operators/detail/simple_block_queue.h" #include "paddle/operators/detail/send_recv.grpc.pb.h" @@ -44,7 +44,7 @@ namespace paddle { namespace operators { namespace detail { -typedef std::pair TensorWithName; +typedef std::pair MessageWithName; class SendRecvServerImpl final : public SendRecvService::Service { public: @@ -60,13 +60,13 @@ class SendRecvServerImpl final : public SendRecvService::Service { void Done(); void SetScope(framework::Scope *scope) { scope_ = scope; }; - const TensorWithName Get() { return this->var_recv_queue_.Pop(); } + const MessageWithName Get() { return this->var_recv_queue_.Pop(); } - void Push(const TensorWithName &msg) { this->var_recv_queue_.Push(msg); } + void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } private: // received variable from RPC, operators fetch variable from this queue. 
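// Note: MessageWithName above is std::pair<std::string, sendrecv::VariableMessage>
// (the template arguments are elided in this listing), so the queue below now
// buffers wire-format messages instead of already-deserialized LoDTensors.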
- SimpleBlockQueue var_recv_queue_; + SimpleBlockQueue var_recv_queue_; framework::Scope *scope_; // condition of the sub program std::mutex mutex_; @@ -89,6 +89,53 @@ class RPCClient { std::unique_ptr stub_; }; +inline void SerializeToMessage(const std::string &name, + const framework::Variable *var, + const platform::DeviceContext &ctx, + VariableMessage *msg) { + msg->set_varname(name); + std::ostringstream oss; + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarDesc_VarType_LOD_TENSOR: + msg->set_type(sendrecv::VarType::LOD_TENSOR); + framework::SerializeToStream(oss, var->Get(), ctx); + break; + case framework::proto::VarDesc_VarType_SELECTED_ROWS: + msg->set_type(sendrecv::VarType::SELECTED_ROWS); + framework::SerializeToStream(oss, var->Get(), + ctx); + break; + default: { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } + msg->set_serialized(oss.str()); +} + +inline void DeserializeFromMessage(const VariableMessage &msg, + const platform::DeviceContext &ctx, + framework::Variable *var) { + using namespace paddle::framework::proto; + std::istringstream iss(msg.serialized()); + switch (msg.type()) { + case sendrecv::VarType::LOD_TENSOR: + DeserializeFromStream(iss, var->GetMutable(), ctx); + break; + case sendrecv::VarType::SELECTED_ROWS: { + DeserializeFromStream(iss, var->GetMutable(), + ctx); + break; + } + default: { + PADDLE_THROW("Deserialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 852ecdfe45e7f4737a505c1f722d25457ad6ad32..c74a5b6ced3af27847dd36511aeab0ee9614415a 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index 45e9d8df702403e66f9100e4edaf5c17470eb20d..597fdad0794ec076669a0b2e6157e5c74c80d735 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 9ed493a7d027e1bd3e8c5fca376678fd5fcf14f1..2dca05760ecc98f822b8c426af3152d365526d2b 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -60,7 +60,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType 
GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 666207ea07628ca5f2a8313fa3f5febda140a294..975e394c78db037a125adeb2c86e3c74dc0eb6f8 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of linear_chain_crf // is determined by its input "Emission". - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), @@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of output of the linear_chain_crf_grad // operator is determined by its input: gradients of LogLikelihood. - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 08b972a233aab8596a5ce7f74ea903df3b8ef0f2..7f551f101f3b3c097f664cc3e9240c2cee7f6830 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase { out_var_name); auto *tensor = out_var->GetMutable(); - DeserializeFromStream(fin, tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); + DeserializeFromStream(fin, tensor, dev_ctx); if (platform::is_gpu_place(place)) { // copy CPU to GPU diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc index f3c0badf2a74431b980abd532e51ba3d251524a1..3d7b15edcfece84ffea539591a4db2690bd82029 100644 --- a/paddle/operators/lod_reset_op.cc +++ b/paddle/operators/lod_reset_op.cc @@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index 7417192479a13ca9537e2d40f9779a3bf5f1eb61..fedd325cf4f7b5a779d17e0259a16d5cf39b77b7 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); + framework::OpKernelType kt = 
OperatorWithKernel::GetExpectedKernelType(ctx); // LogicalOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 6e5cbd6f8cefc965d6c8d24b16eb3bafde55cc49..bb03def4391da80c6219f7863d300fd3c8d8c7ac 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), @@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index b8fcec0f29b46e838f91ad1ee0fded8e42f27bd5..3b90b64b4effacf7240fb1bee8c0aa44251ad727 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 11e047b5d57b6bc18e6d6f4a1d122e18dfc6e357..78263da2fbf843f6a5af2ba95aa0b219a7523b52 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), @@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index d39ca87d53518963f652f7b8c8cb289a6fef70fd..84ba3ead2b52547b989a4541f31ea31ffcce6c63 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType 
GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc index 78b5e2767842312722fac3509e843a05fe194559..03302f5cbf5674dca1d22a84137579090b4d5eac 100644 --- a/paddle/operators/net_op.cc +++ b/paddle/operators/net_op.cc @@ -56,11 +56,11 @@ void NetOp::CompleteAddOp(bool calc) { std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); } -std::string NetOp::DebugString() const { +std::string NetOp::DebugStringEx(const framework::Scope* scope) const { std::ostringstream os; - os << OperatorBase::DebugString() << std::endl; + os << OperatorBase::DebugStringEx(scope) << std::endl; for (auto& op : ops_) { - std::istringstream is(op->DebugString()); + std::istringstream is(op->DebugStringEx(scope)); for (std::string line; std::getline(is, line);) { os << " " << line << std::endl; } diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index 85d0153b32c0ba53bfe0912fc2682c8b635ba172..b24042f5ef5822eabcada8ed9d21c552579e8064 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -106,7 +106,8 @@ class NetOp : public framework::OperatorBase { void CompleteAddOp(bool calculate = true); - std::string DebugString() const override; + std::string DebugStringEx( + const framework::Scope* scope = nullptr) const override; bool IsNetOp() const override; std::vector OutputVars(bool has_intermediate) const override; diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..077245cd83b2535b45927c8350ca7bc14c541d21 --- /dev/null +++ b/paddle/operators/parallel_do_op.cc @@ -0,0 +1,293 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/framework/executor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/threadpool.h" + +namespace paddle { +namespace operators { + +static constexpr char kInputs[] = "inputs"; +static constexpr char kParameters[] = "parameters"; +static constexpr char kPlaces[] = "places"; + +static constexpr char kOutputs[] = "outputs"; +static constexpr char kParallelScopes[] = "parallel_scopes"; + +static constexpr char kParallelBlock[] = "sub_block"; + +// using ParallelScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; +using OperatorBase = framework::OperatorBase; + +void SplitTensorAndMoveTensorToScopes( + const framework::Scope &scope, + const std::vector &sub_scopes, + const std::vector &places, + const std::vector &names) { + for (auto &argu : names) { + auto *var = scope.FindVar(argu); + const auto &tensor = var->Get(); + auto lod_tensors = tensor.SplitLoDTensor(places); + + for (auto &lod : lod_tensors) { + VLOG(3) << lod.dims(); + } + + for (size_t i = 0; i < sub_scopes.size(); ++i) { + *sub_scopes[i]->Var(argu)->GetMutable() = lod_tensors[i]; + } + } +} + +class ParallelDoOp : public framework::OperatorBase { + public: + ParallelDoOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + // TODO(tonyyang-svail): get places from input + std::vector places; + places.emplace_back(platform::CPUPlace()); + places.emplace_back(platform::CPUPlace()); + + auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) + ->GetMutable>(); + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + sub_scopes.push_back(&scope.NewScope()); + } + + SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places, + Inputs(kInputs)); + + std::vector> workers; + workers.reserve(places.size()); + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + VLOG(3) << "Run " << place_idx; + + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + // copy parameter + // some version of boost lacks != for boost::variant + if (!(dev_ctx.GetPlace() == place)) { + PADDLE_THROW("Not Implemented"); + } + + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + + // merge output + for (auto &o_name : Outputs(kOutputs)) { + std::vector lod_tensors; + lod_tensors.reserve(sub_scopes.size()); + for (auto *sub_scope : sub_scopes) { + lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); + } + + auto *lod_tensor_to_be_merged = + scope.FindVar(o_name)->GetMutable(); + lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); + } + } +}; + +class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "").AsDuplicable(); + AddInput(kParameters, 
"").AsDuplicable(); + AddInput(kPlaces, ""); + AddOutput(kOutputs, "").AsDuplicable(); + AddOutput(kParallelScopes, ""); + AddAttr(kParallelBlock, ""); + AddComment(R"DOC( +ParallelDo Operator. +)DOC"); + } +}; + +class ParallelDoGradOp : public OperatorBase { + public: + ParallelDoGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // // get device context from pool + // platform::DeviceContextPool &pool = + // platform::DeviceContextPool::Instance(); + // auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + + // TODO(tonyyang-svail): get places from input + std::vector places; + places.emplace_back(platform::CPUPlace()); + places.emplace_back(platform::CPUPlace()); + + // feed output@grad + SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places, + Inputs(framework::GradVarName(kOutputs))); + + for (auto &s : Inputs(framework::GradVarName(kOutputs))) { + VLOG(3) << s; + VLOG(3) << scope.FindVar(s)->Get(); + for (auto *sub_scope : sub_scopes) { + VLOG(3) << sub_scope->FindVar(s)->Get(); + } + } + + // exe run + std::vector> workers; + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + VLOG(3) << "Run " << place_idx; + + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + // execute + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + + // merge grad + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + VLOG(3) << s; + + auto &t = sub_scopes[0]->FindVar(s)->Get(); + VLOG(3) << t; + + std::string s_buf = s + "@BUF"; + auto *t_buf = sub_scopes[0]->Var(s_buf)->GetMutable(); + + for (size_t place_idx = 1; place_idx < places.size(); ++place_idx) { + auto &tt = sub_scopes[place_idx]->FindVar(s)->Get(); + VLOG(3) << place_idx; + VLOG(3) << tt; + framework::CopyFrom(tt, places[0], t_buf); + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {s, s_buf}}}, {{"Out", {s}}}, + framework::AttributeMap{}); + sum_op->Run(*sub_scopes[0], place); + } + + VLOG(3) << t; + framework::CopyFrom(t, place, scope.FindVar(s)->GetMutable()); + } + } +}; + +class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("parallel_do_grad"); + for (auto &input_param : this->InputNames()) { + VLOG(3) << input_param; + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kParallelScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + 
this->OutputGrad(output_param)); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class ParallelDoGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + std::vector input{kParameters, kInputs}; + std::vector output{kOutputs}; + for (auto &s : input) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); + } + for (auto &s : output) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + } + for (auto &s : input) { + ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + } + if (ctx->HasInputs(kParameters)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, + paddle::operators::ParallelDoOpProtoMaker, + paddle::operators::ParallelDoGradOpDescMaker); +REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index 50057eb6483e9c9e745bc07dee26a0bbbbb5a48c..d3cf5fa638c53dfdfacec153211f447a1e2fa3bf 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -58,6 +58,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", "Out"); } void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 76c5123527c3ff5b7e6c7eec39f4eb1d612759d3..1d31d813af4ec4eb829b906fa9add38cc71d54f3 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc index a6b23c995b8b9104f2da9d3d29ceb3eb88e7da63..5aa5167dbb83c9caf2e754859938e51700d8ec3f 100644 --- a/paddle/operators/positive_negative_pair_op.cc +++ b/paddle/operators/positive_negative_pair_op.cc @@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Score")->type()), diff --git a/paddle/operators/precision_recall_op.cc 
b/paddle/operators/precision_recall_op.cc index c5753147effd17c012683e1058e34af46288f366..f1598d53cae2a58acd0207a20938b4f744ba0efe 100644 --- a/paddle/operators/precision_recall_op.cc +++ b/paddle/operators/precision_recall_op.cc @@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("MaxProbs")->type()), diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 322f8571cfd4341f064e8f9df512a8d74b91ed9d..82fceb3da7d396bcfc1d95baccc4ee36b87f4d39 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -60,7 +60,7 @@ class RecvOp : public framework::OperatorBase { } void Stop() override { - detail::TensorWithName term_msg; + detail::MessageWithName term_msg; term_msg.first = LISTEN_TERMINATE_MESSAGE; rpc_service_->Push(term_msg); rpc_server_->Shutdown(); @@ -94,7 +94,7 @@ class RecvOp : public framework::OperatorBase { // the gradient arrives, just add suffix 0~n then average the gradient. for (size_t i = 0; i < param_count * trainer_count; ++i) { // blocking get one var from client. - const detail::TensorWithName &v = rpc_service_->Get(); + const detail::MessageWithName &v = rpc_service_->Get(); auto grad_var_name = v.first; if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { exit_flag = true; @@ -121,11 +121,10 @@ class RecvOp : public framework::OperatorBase { } auto *var = recv_scope.Var(grad_var_name); - auto *tensor = var->GetMutable(); - // FIXME(typhoonzero): do not copy - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); - framework::CopyFrom(v.second, dev_place, dev_ctx, tensor); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + detail::DeserializeFromMessage(v.second, dev_ctx, var); } if (exit_flag) { break; diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index a3ff4a6ca0ef30be42e7801386a3561930638a8a..172d28bb3b647901d4de7bc03c9de21e3468a364 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -77,6 +77,7 @@ class ReduceGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } } }; diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc index 8d652ff806461cea3d0e8d3bd70704b4b6bc2173..0fa615d8743998448281f87d1ce3d8aea7f6b624 100644 --- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -88,20 +88,33 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( const framework::LoDTensor &x) const { std::vector absolute_table; - size_t level = 0; - size_t size = x.lod()[level].size(); - for (size_t i = 0; i < size - 1; ++i) { - auto lod_offset = - framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + if (x.lod().empty()) { + // For Tensor without lod, such as the output of sequence_pool_op + size_t size = x.dims()[0]; + absolute_table.reserve(size); + for (size_t i = 0; i < size; ++i) { + absolute_table.emplace_back(); + absolute_table.back().length = 
1; + absolute_table.back().offset = i; + } + } else { + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); - auto &offset = lod_offset.second; + auto &offset = lod_offset.second; - absolute_table.emplace_back(); - absolute_table.back().length = offset.second - offset.first; - absolute_table.back().offset = offset.first; - absolute_table.back().lod = lod_offset.first; + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } } + return absolute_table; } diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index ef1804d9762200686ac8537140af046c21443779..a7351f11c5da7b850681346942ad699aba85a8e0 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 806dccc6ca78bf64da828fe13e08e043097bd939..b65334890633f54e70179bfa8fe5463901f7947e 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), @@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index 108e2dec6b3eecadd431fd25f9a31ec17a874b6b..fa94424bf9e8e719ec0822268685b0806a109d21 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -20,22 +20,27 @@ limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/string/printf.h" USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(recv); USE_OP(sum); +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + // global for simplicity. 
-std::unique_ptr recv_op; +std::unique_ptr recv_op; -void InitTensorsInScope(paddle::framework::Scope &scope, - paddle::platform::CPUPlace &place) { - paddle::platform::CPUDeviceContext ctx(place); +void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); for (int i = 0; i < 2; ++i) { auto var_name = paddle::string::Sprintf("x%d", i); auto var = scope.Var(var_name); - auto tensor = var->GetMutable(); + auto tensor = var->GetMutable(); tensor->Resize({10, 10}); float *expect = tensor->mutable_data(place); for (int64_t i = 0; i < tensor->numel(); ++i) { @@ -44,21 +49,53 @@ void InitTensorsInScope(paddle::framework::Scope &scope, } auto out_var = scope.Var("Out"); - auto out_tensor = out_var->GetMutable(); + auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); out_tensor->mutable_data(place); // allocate } -void AddOp(const std::string &type, - const paddle::framework::VariableNameMap &inputs, - const paddle::framework::VariableNameMap &outputs, - paddle::framework::AttributeMap attrs, - paddle::framework::BlockDesc *block) { +void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + int64_t height = 10; + int64_t row_numel = 10; + m::SetConstant set_one; + // init x0 + std::vector rows0{0, 4, 7}; + auto x0_var = scope.Var("x0"); + auto x0 = x0_var->GetMutable(); + x0->set_rows(rows0); + x0->set_height(height); + auto x0_value = x0->mutable_value(); + x0_value->mutable_data( + f::make_ddim({static_cast(rows0.size()), row_numel}), place); + set_one(ctx, x0_value, 1.0); + + // init x1 + std::vector rows1{2, 9}; + auto x1_var = scope.Var("x1"); + auto x1 = x1_var->GetMutable(); + x1->set_rows(rows1); + x1->set_height(height); + auto x1_value = x1->mutable_value(); + x1_value->mutable_data( + f::make_ddim({static_cast(rows1.size()), row_numel}), place); + set_one(ctx, x1_value, 1.0); + + auto out_var = scope.Var("Out"); + auto out = out_var->GetMutable(); + auto out_value = out->mutable_value(); + out->set_height(height); + out_value->mutable_data(f::make_ddim({5, 10}), place); +} + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + f::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(paddle::framework::proto::DataType::FP32); + var->SetDataType(f::proto::DataType::FP32); } } @@ -74,58 +111,99 @@ void AddOp(const std::string &type, op->SetAttrMap(attrs); } -void StartServerNet() { - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - InitTensorsInScope(scope, place); +void StartServerNet(bool is_sparse) { + f::Scope scope; + p::CPUPlace place; + if (is_sparse) { + InitSelectedRowsInScope(scope, place); + } else { + InitTensorsInScope(scope, place); + } // sub program run in recv_op, for simple test we use sum - paddle::framework::ProgramDesc program; - paddle::framework::BlockDesc *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); // X for server side tensors, RX for received tensers, must be of same shape. 
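// Why CPUDense below expects actual == 2 * expected (reasoning, not part of
// the patch): InitTensorsInScope fills x0 and x1 with identical values, the
// server's sub-program computes Out = sum(x0, x1), and send_op ships x1 out
// and pulls Out back, so every returned element equals x1[i] + x1[i].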
- AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"x0"}}}, {}, block); + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); - paddle::framework::AttributeMap attrs; + f::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); - attrs.insert({"ParamList", std::vector({"x0"})}); + attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); std::string program_proto; PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); attrs.insert({"OptimizeProgram", program_proto}); - recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, - {}, attrs); + recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs); recv_op->Run(scope, place); } -TEST(SendRecvOp, CPU) { - std::thread server_thread(StartServerNet); - sleep(5); // wait server to start +TEST(SendRecvOp, CPUDense) { + std::thread server_thread(StartServerNet, false); + sleep(3); // wait server to start // local net - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; + f::Scope scope; + p::CPUPlace place; InitTensorsInScope(scope, place); - paddle::framework::AttributeMap attrs; + f::AttributeMap attrs; attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); - auto send_op = paddle::framework::OpRegistry::CreateOp( - "send", {{"X", {"x1"}}}, {{"Out", {"x0"}}}, attrs); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); send_op->Run(scope, place); auto in_var = scope.Var("x1"); - auto tensor = in_var->GetMutable(); + auto tensor = in_var->GetMutable(); float *expected = tensor->data(); - auto out_var = scope.Var("x0"); - auto target = out_var->GetMutable(); + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); // x1 * 2 == x0 EXPECT_NE(target->memory_size(), size_t(0)); float *actual = target->data(); for (int64_t i = 0; i < target->numel(); ++i) { EXPECT_EQ(expected[i] * 2, actual[i]); } + recv_op->Stop(); + server_thread.join(); + recv_op.reset(nullptr); +} +TEST(SendRecvOp, CPUSparse) { + std::thread server_thread(StartServerNet, true); + sleep(3); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + InitSelectedRowsInScope(scope, place); + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto x0 = scope.Var("x0")->GetMutable(); + auto x1 = scope.Var("x1")->GetMutable(); + auto out = scope.Var("Out")->GetMutable(); + auto actual = out->mutable_value(); + + std::unique_ptr expect{new f::SelectedRows()}; + auto expect_value = expect->mutable_value(); + expect_value->mutable_data(f::make_ddim({5, 10}), place); + + m::SelectedRowsAdd add_functor; + add_functor(ctx, *x0, *x1, expect.get()); + + EXPECT_EQ(actual->numel(), expect_value->numel()); + EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size()); + + for (int64_t i = 0; i < expect_value->numel(); ++i) { + EXPECT_EQ(expect_value->mutable_data(place)[i], + actual->mutable_data(place)[i]); + } recv_op->Stop(); server_thread.join(); - // recv_op.reset(); + recv_op.reset(); } diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index 
aea98744d8fc1fc59a07250d57f76f26fb9f3634..34e1a12591515e2363651a2722f52963f7ae43b5 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 98bd8854903e5abf6d27432a2af0aaae980c0b1d..f79106ff0f7a3d0918bc3a5c428179cb170ffc79 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), @@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index 41e65b701e62bd2e671f3590869a5d7fed90701c..7135780c92dca00503ab098dc6930afd8fac0be8 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input<framework::Tensor>("Logits")->type()), @@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index b86e8266425ca094a51d224fd39ce33700057f13..a4c08430d85ae418ec6a0c0e8e954415711cd23f 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); if (x_vars[0]->IsType<framework::LoDTensor>()) { diff --git a/paddle/operators/tensor.save b/paddle/operators/tensor.save deleted file mode 100644 index c24308a7d0131b84c28c0a9857cce4949afb2091..0000000000000000000000000000000000000000 Binary files a/paddle/operators/tensor.save and /dev/null differ diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 4d5dd86cb8103a76247913cc088db4cff6b6ff43..3a314bdb9b058f5dfc61b35795000298c40551e6 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -63,7 +63,7 @@ class UniformRandomOp : public
framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index aeed9679b2a3cce744189809c223a5b0d581ccdd..50cee11a7a2d6a2483c6851e888b61475cba1376 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -71,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { class UnpoolOp : public framework::OperatorWithKernel { protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), @@ -110,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel { protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 728ef6079465d57f54dab383aac5e2bb750fe113..65d827e0e0c5cfc3897c1fd0b971b766201cc1e2 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -25,12 +25,12 @@ namespace operators { using StepScopeVar = std::vector<framework::Scope *>; using LoDTensor = framework::LoDTensor; -constexpr char kStepBlock[] = "sub_block"; -constexpr char kCondition[] = "Condition"; -constexpr char kStepScopes[] = "StepScopes"; -constexpr char kParameters[] = "X"; -constexpr char kParamGrads[] = "X@GRAD"; -constexpr char kOutputs[] = "Out"; +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; class WhileOp : public framework::OperatorBase { public: @@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { public: WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput(kParameters, + AddInput(kX, "A set of variables, which are required by operators inside the " "block of While Op.") .AsDuplicable(); @@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase { executor.Run(*program, *cur_scope_iter, block->ID(), false); - auto &pg_names = Outputs(kParamGrads); - auto &p_names = Inputs(kParameters); + auto &pg_names = Outputs(kXGRAD); + auto &p_names = Inputs(kX); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { if (pg_names[param_id] == framework::kEmptyVarName) { @@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr<framework::OpDesc> Apply() const override { auto *grad = new framework::OpDesc(); grad->SetType("while_grad"); - grad->SetInput(kParameters, Input(kParameters)); + grad->SetInput(kX, Input(kX)); // Not all of IGs will be generated by inner gradient operators of while op. // Ignore IGs that are not generated by the inside block.
- auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false); + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); std::unordered_set<std::string> all_outs; for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) { @@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { } } - grad->SetOutput(framework::GradVarName(kParameters), igs); + grad->SetOutput(framework::GradVarName(kX), igs); grad->SetInput(kOutputs, Output(kOutputs)); @@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { std::unordered_set<std::string> block_ins; auto *fwd_block = this->grad_block_[0]->ParentBlock(); { - for (auto &p : Input(kParameters)) { + for (auto &p : Input(kX)) { block_ins.insert(p); } for (auto &o : Output(kOutputs)) { @@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { - auto p_names = op_desc.Input(kParameters); - auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); + auto p_names = op_desc.Input(kX); + auto pg_names = op_desc.Output(framework::GradVarName(kX)); for (size_t i = 0; i < p_names.size(); ++i) { auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); @@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { class WhileGradOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { - ctx->HasInputs(kParameters); - ctx->HasOutputs(framework::GradVarName(kParameters)); + ctx->HasInputs(kX); + ctx->HasOutputs(framework::GradVarName(kX)); ctx->HasInputs(kOutputs); ctx->HasInputs(framework::GradVarName(kOutputs)); - auto p_names = ctx->Inputs(kParameters); - auto pg_names = ctx->Outputs(kParamGrads); - auto var_types = ctx->GetInputsVarType(kParameters); + auto p_names = ctx->Inputs(kX); + auto pg_names = ctx->Outputs(kXGRAD); + auto var_types = ctx->GetInputsVarType(kX); std::vector<std::string> names_to_set; std::vector<framework::DDim> dims_to_set; for (size_t i = 0; i < p_names.size(); ++i) { if (pg_names[i] == framework::kEmptyVarName) { continue; } - auto dims = ctx->GetInputsElementDim(kParameters, i); + auto dims = ctx->GetInputsElementDim(kX, i); if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { names_to_set.push_back(pg_names[i]); dims_to_set.push_back(dims); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 8c4803b9739bb54cae89de62468a47631a5dde94..44f6d85cd1510f309595ca711de2e0f767219580 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -21,10 +21,16 @@ ELSE() set(GPU_CTX_DEPS) ENDIF() +IF(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +ELSE() + set(MKLDNN_CTX_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator - system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) + system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index
ea07f2e002cb76d09a11f7a5305c2d45b780e7bd..9d9348079a0179418ab1a1474cfd8b69136f26b2 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -127,15 +127,21 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + if (dynload::HasCUDNN()) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + } else { + cudnn_handle_ = nullptr; + } } CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + if (cudnn_handle_ != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -160,19 +166,69 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; } -CUDNNDeviceContext::CUDNNDeviceContext(CUDAPlace place) - : CUDADeviceContext(place) { - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream())); +#endif + +#ifdef PADDLE_WITH_MKLDNN +MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) + : CPUDeviceContext(place), ready_(false) { + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); } -CUDNNDeviceContext::~CUDNNDeviceContext() { - SetDeviceId(boost::get<CUDAPlace>(GetPlace()).device); - Wait(); - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); +template <typename T> +void MKLDNNDeviceContext::AddElement(const std::string& op_key, + const T& value) { + if (GetElement<T>(op_key)) { + return; + } + GetElementPool<T>().emplace(op_key, std::move(value)); +} + +template <typename T> +const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const { + auto it = GetElementPool<T>().find(op_key); + return it == GetElementPool<T>().end() ? nullptr : it->second; +} + +template <> +const std::unordered_map<const std::string, const MKLDNNMemoryPtr, std::hash<std::string>>& +MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const { + return memory_pool_; } -cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; } +template <> +const std::unordered_map<const std::string, const MKLDNNPrimitivePtr, std::hash<std::string>>& +MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const { + return primitive_pool_; +} + +template <> +const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr, std::hash<std::string>>& +MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const { + return primitive_desc_pool_; +} + +void MKLDNNDeviceContext::Execute(bool block) { + if (pipeline_.empty()) { + return; + } + ResetStream(); + stream_->submit(pipeline_).wait(block); + ready_ = false; + pipeline_.clear(); +} + +void MKLDNNDeviceContext::ResetStream() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn has a specific method to reset this state + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; +} #endif diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 2b366e6383d23e2d31a194edd04412892a8311eb..7a0040c9c229af79ea8be1049dfd6c0d1b4d19cf 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -21,6 +21,10 @@ limitations under the License.
*/ #define EIGEN_USE_GPU #endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/platform/mkldnn_helper.h" +#endif + #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -103,18 +107,54 @@ struct DefaultDeviceContextType { using TYPE = CUDADeviceContext; }; -class CUDNNDeviceContext : public CUDADeviceContext { +#endif + +#ifdef PADDLE_WITH_MKLDNN +class MKLDNNDeviceContext : public CPUDeviceContext { public: - explicit CUDNNDeviceContext(CUDAPlace place); - virtual ~CUDNNDeviceContext(); + explicit MKLDNNDeviceContext(CPUPlace place); - /*! \brief Return cudnn handle in the device context. */ - cudnnHandle_t cudnn_handle() const; + /* \brief Add new element: memory, primitive or primitive desc */ + template <typename T> + void AddElement(const std::string& op_key, const T& value); + + /* \brief Get existed element: memory, primitive or primitive desc */ + template <typename T> + const T& GetElement(const std::string& op_key) const; + + /* \brief Get element pool: memory, primitive or primitive desc pool */ + template <typename T> + const std::unordered_map<const std::string, const T, std::hash<std::string>>& + GetElementPool() const; + + /* \brief Get the active engine */ + const MKLDNNEngine& engine() const { return *engine_; } + + /* \brief Submit primitive to pipeline */ + void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); } + + /*! \brief Execute all submitted primitives in pipeline */ + void Execute(bool block = true); + + protected: + /*! \brief Reset the stream to prepare for the next execute */ + void ResetStream(); private: - cudnnHandle_t cudnn_handle_; + std::unordered_map<const std::string, const MKLDNNMemoryPtr, std::hash<std::string>> + memory_pool_; + std::unordered_map<const std::string, const MKLDNNPrimitivePtr, std::hash<std::string>> + primitive_pool_; + std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr, std::hash<std::string>> + primitive_desc_pool_; + std::vector<MKLDNNPrimitive> pipeline_; + MKLDNNStreamPtr stream_; + MKLDNNEnginePtr engine_; + bool ready_; }; - #endif /*! \brief device context pool singleton */ @@ -151,7 +191,7 @@ class DeviceContextPool { struct Hash { std::hash<int> hash_; size_t operator()(const platform::Place& place) const { - int pre_hash = place.which() + (1 << LEFT_SHIFT); + int pre_hash = place.which() << LEFT_SHIFT; if (platform::is_gpu_place(place)) { pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId(); } diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index ca10cf34639376798bf5ba05970c9c734e5a1ef8..767fe9b24a5d51630397a864c98928de52d21f31 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -49,21 +49,6 @@ TEST(Device, CUDADeviceContext) { } } -TEST(Device, CUDNNDeviceContext) { - using paddle::platform::CUDNNDeviceContext; - using paddle::platform::CUDAPlace; - if (paddle::platform::dynload::HasCUDNN()) { - int count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - CUDNNDeviceContext* device_context = new CUDNNDeviceContext(CUDAPlace(i)); - cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); - ASSERT_NE(nullptr, cudnn_handle); - ASSERT_NE(nullptr, device_context->stream()); - delete device_context; - } - } -} - TEST(Device, DeviceContextPool) { using paddle::platform::DeviceContextPool; using paddle::platform::CUDADeviceContext; diff --git a/paddle/platform/mkldnn_helper.h b/paddle/platform/mkldnn_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..cd52a8b4c434071a030a8e7a8a70fc3adba8460c --- /dev/null +++ b/paddle/platform/mkldnn_helper.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <mkldnn.hpp> + +namespace paddle { +namespace platform { + +using MKLDNNStream = mkldnn::stream; +using MKLDNNEngine = mkldnn::engine; +using MKLDNNMemory = mkldnn::memory; +using MKLDNNPrimitive = mkldnn::primitive; +using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>; + +typedef std::unique_ptr<MKLDNNStream> MKLDNNStreamPtr; +typedef std::unique_ptr<MKLDNNEngine> MKLDNNEnginePtr; +typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr; +typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr; +typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr; + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 249527e3e136992970033c44ad490a1744bfed35..f05260ccac4c2b11e5568d484141720fa4792d13 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -51,6 +51,18 @@ bool places_are_same_class(const Place &p1, const Place &p2) { return p1.which() == p2.which(); } +bool is_same_place(const Place &p1, const Place &p2) { + if (places_are_same_class(p1, p2)) { + if (is_cpu_place(p1)) { + return true; + } else { + return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2); + } + } else { + return false; + } +} + std::ostream &operator<<(std::ostream &os, const Place &p) { detail::PlacePrinter printer(os); boost::apply_visitor(printer, p); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 76b5c502cc48431a4e9b13b07505978884576e1d..ba32dd3be6199371bf4624be60a2955a320dc22c 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -61,6 +61,7 @@ const CPUPlace default_cpu(); bool is_gpu_place(const Place &); bool is_cpu_place(const Place &); bool places_are_same_class(const Place &, const Place &); +bool is_same_place(const Place &, const Place &); std::ostream &operator<<(std::ostream &, const Place &); diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 4e89e5c600bf7f6e23faf8d62bc7c72f7ee8ca7a..4283724d28afcbdeaceb36e6eafa3e0e2fd9f1f9 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/profiler.h" +#include <algorithm> +#include <iomanip> +#include "glog/logging.h" namespace paddle { namespace platform { // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU.
+static std::string g_profiler_place = ""; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() { Event::Event(EventKind kind, std::string name, uint32_t thread_id, DeviceContext* dev_ctx) - : kind_(kind), - name_(std::move(name)), - thread_id_(thread_id), - has_cuda_(false) { + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { #ifdef PADDLE_WITH_CUDA auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx); if (cuda_dev_ctx) { @@ -72,11 +74,11 @@ std::string Event::kind() const { PADDLE_THROW("Unknown EventKind."); } -double Event::CpuElapsedUs(const Event& e) const { - return (e.cpu_ns_ - cpu_ns_) / (1000.0); +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); } -double Event::CudaElapsedUs(const Event& e) const { +double Event::CudaElapsedMs(const Event& e) const { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(e.has_cuda() && has_cuda()); PADDLE_ENFORCE(e.device() == device()); @@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const { PADDLE_ENFORCE(cudaEventSynchronize(e.event())); float ms; PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms * 1000.0; + return ms; #else PADDLE_THROW("CUDA is not enabled"); #endif @@ -113,21 +115,27 @@ inline EventList& GetEventList() { } void Mark(const std::string& name, DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, - dev_ctx); + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; - GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, - dev_ctx_); + name_ = name; + PushEvent(name_, dev_ctx_); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, - dev_ctx_); + PopEvent(name_, dev_ctx_); } void EnableProfiler(ProfilerState state) { @@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy events first to reduce the startup overhead.
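// Editor's note (illustration, not part of the original commit): the event
// APIs added in this file can be used with explicit markers or via the RAII
// guard; the profiler_test.cc changes further below exercise exactly this
// flow. A minimal usage sketch:
//
//   using namespace paddle::platform;
//   EnableProfiler(ProfilerState::kCPU);   // choose the CPU timer
//   DeviceContext* dev_ctx = nullptr;      // a null context is fine on CPU
//   PushEvent("my_op", dev_ctx);           // explicit push/pop pair
//   // ... code to be timed ...
//   PopEvent("my_op", dev_ctx);
//   {
//     RecordEvent guard("my_scope", dev_ctx);  // RAII: push in ctor, pop in dtor
//     // ... code to be timed ...
//   }
//   std::vector<std::vector<Event>> events = DisableProfiler();
//   ParseEvents(events, EventSortingKey::kTotal);  // ParseEvents is added below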
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() { return result; } +void ParseEvents(std::vector<std::vector<Event>>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function<bool(EventItem&, EventItem&)> sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event end time"; + } + + std::vector<std::vector<EventItem>> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list<Event> pushed_events; + std::vector<EventItem> event_items; + std::unordered_map<std::string, int> event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); + } else { + int index = event_idx[event_name]; + event_items[index].calls += 1; + // total time + event_items[index].total_time += event_time; + // min time + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); + // max time + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); + } + + // remove the push marker from the list + pushed_events.erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; + } + } + } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), sorted_func); + } + + events_table.push_back(event_items); + // log warning if there are events with `push` but without `pop` + std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + } + + // Print report + PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); +} + +void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, + std::string&
sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } + } + std::cout << std::endl; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 47104ea9d08dad373888c0af463d2b1dbe72a269..6df48ef8806e865f473b4317ac0283863c3c6f64 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -33,6 +33,7 @@ class Event { std::string kind() const; std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } bool has_cuda() const { return has_cuda_; } #ifdef PADDLE_WITH_CUDA @@ -40,8 +41,8 @@ class Event { int device() const { return device_; } #endif - double CpuElapsedUs(const Event& e) const; - double CudaElapsedUs(const Event& e) const; + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; private: EventKind kind_; @@ -94,6 +95,10 @@ enum ProfilerState { void Mark(const std::string& name, DeviceContext* dev_ctx); +void PushEvent(const std::string& name, DeviceContext* dev_ctx); + +void PopEvent(const std::string& name, DeviceContext* dev_ctx); + struct RecordEvent { explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); @@ -101,6 +106,8 @@ struct RecordEvent { // The device context is used by Event to get the current cuda stream. DeviceContext* dev_ctx_; + // Event name + std::string name_; }; // Enable the profiling function. @@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state); // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
std::vector<std::vector<Event>> DisableProfiler(); +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Candidate keys to sort the profiling report +enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; + +// Parse the event list and output the profiling report +void ParseEvents(std::vector<std::vector<Event>>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); + +// Print results +void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 47cf7be146121c54300df928fe329268ed975373..13dea713c71e147ed5dd8d090e92d86c96256c09 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, nullptr); - EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); } #ifdef PADDLE_WITH_CUDA @@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); - EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); } #endif @@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) { using paddle::platform::EventKind; using paddle::platform::RecordEvent; using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; ProfilerState state = ProfilerState::kCPU; DeviceContext* dev_ctx = nullptr; @@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) { #endif EnableProfiler(state); + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to be analyzed + * ... + * PopEvent(evt_name, dev_ctx); + */ + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * code to be analyzed + * ...
+ * } + */ for (int i = 1; i < 5; ++i) { - std::string name = "op_" + std::to_string(i); + std::string name = "evs_op_" + std::to_string(i); RecordEvent record_event(name, dev_ctx); int counter = 1; while (counter != i * 1000) counter++; } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); std::vector<std::vector<paddle::platform::Event>> events = paddle::platform::DisableProfiler(); + // Will remove parsing-related code from test later + ParseEvents(events, EventSortingKey::kTotal); + int cuda_startup_count = 0; int start_profiler_count = 0; int stop_profiler_count = 0; @@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); #ifdef PADDLE_WITH_CUDA - EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else - EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); #endif } } diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc index 761635aa5e5eac445c2ec8331b0dc37ffd11248c..b13ad42ea29453354798d88bff8ef47339d1a614 100644 --- a/paddle/pybind/const_value.cc +++ b/paddle/pybind/const_value.cc @@ -23,11 +23,6 @@ void BindConstValue(pybind11::module& m) { m.def("kTempVarName", [] { return framework::kTempVarName; }); m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); - - // for kernel_hint key - m.def("kUseCPU", [] { return framework::kUseCPU; }); - m.def("kUseCUDNN", [] { return framework::kUseCUDNN; }); - m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; }); } } // namespace pybind diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 07292d47e9c165c67fe4a30ee7d851c350beb2e0..564a3700011c1b0c294c27af4b86ab967a0f0e1e 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -216,7 +216,7 @@ void BindVarDsec(py::module &m) { .def("set_dtype", &VarDesc::SetDataType) .def("shape", &VarDesc::Shape, py::return_value_policy::reference) .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference) - .def("lod_level", &VarDesc::GetLodLevel) + .def("lod_level", &VarDesc::GetLoDLevel) .def("set_lod_level", &VarDesc::SetLoDLevel) .def("type", &VarDesc::GetType) .def("set_type", &VarDesc::SetType) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 04485ce7c1ab87f8655b0e6cbaecc36b3382f647..5d170c66e97f56440968ba568167e6845631e1cc 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -427,8 +427,15 @@ All parameter, weight, gradient are variables in Paddle.
m.def("unique_integer", UniqueIntegerGenerator); m.def("init_gflags", framework::InitGflags); + m.def("init_glog", framework::InitGLOG); m.def("init_devices", &framework::InitDevices); + m.def("use_cpu", framework::UseCPU); + m.def("use_mkldnn", framework::UseMKLDNN); + m.def("use_cuda", framework::UseCUDA); + m.def("use_cudnn", framework::UseCUDNN); + m.def("use_all", framework::UseALL); + m.def("is_compile_gpu", IsCompileGPU); m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 92039ec6b05d224e702f0ba5dc05c057a492287e..e70d04d9017e9e36bbd55d6a28889d9ba7fb2a13 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -193,6 +193,16 @@ EOF EOF } +function gen_capi_package() { + if [[ ${WITH_C_API} == "ON" ]]; then + install_prefix="/paddle/build/capi_output" + rm -rf $install_prefix + make DESTDIR="$install_prefix" install + cd $install_prefix/usr/local + ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz + fi +} + set -xe cmake_gen ${PYTHON_ABI:-""} @@ -200,6 +210,11 @@ run_build run_test gen_docs gen_dockerfile - -printf "If you need to install PaddlePaddle in develop docker image," -printf "please make install or pip install build/python/dist/*.whl.\n" +gen_capi_package + +if [[ ${WITH_C_API:-OFF} == "ON" ]]; then + printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" +else + printf "If you need to install PaddlePaddle in develop docker image," + printf "please make install or pip install build/python/dist/*.whl.\n" +fi diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index a0a365aa0bb0ac26939a02c1cd626d0c17c6a9fe..2b68d89e48a3efd5de205ce33643b7e6320a4303 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -29,6 +29,7 @@ DECLARE_bool(with_gpu); DECLARE_bool(parallel_nn); DECLARE_string(config_args); DECLARE_bool(use_mkldnn); +DECLARE_bool(use_mkl_packed); const char *kConfigParserModuleName = "paddle.trainer.config_parser"; const char *kConfigParserFuncName = "parse_config_and_serialize"; @@ -46,6 +47,7 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu << ",parallel_nn=" << FLAGS_parallel_nn << ",use_mkldnn=" << FLAGS_use_mkldnn + << ",use_mkl_packed=" << FLAGS_use_mkl_packed << ",cudnn_version=" << hl_get_cudnn_lib_version(); if (!FLAGS_config_args.empty()) { configArgs << "," << FLAGS_config_args; diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 9a7dc0e35622383a190f8b3a80736e6b42c9c959..ea47cf23eb6e56082eeb92f3c6dff8d03be0d679 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -27,6 +27,13 @@ DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); DEFINE_bool(use_mkldnn, false, "Only support CPU training"); #endif +#ifdef PADDLE_WITH_MKLML +// TODO(TJ): change to true when fully confirmed +DEFINE_bool(use_mkl_packed, false, "Whether to use MKL Packed Optimization"); +#else +DEFINE_bool(use_mkl_packed, false, "Not to use MKL Packed Optimization"); +#endif + DEFINE_bool(parallel_nn, false, "Whether to use multi-threads to calculate one neural network." 
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index 1832bb515ec85df3d7733e01b063a01ad6a3b282..b64295bca09a199f24605a158d1d9db7e7d91660 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -41,3 +41,4 @@ DECLARE_string(predict_file); DECLARE_bool(prev_batch_state); DECLARE_string(init_model_path); DECLARE_bool(use_mkldnn); +DECLARE_bool(use_mkl_packed); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6f589e916979584897863db72924c885c258d4b2..36919ab00bf6fc1f9aee350074e5532bfdc7d45e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -29,8 +29,8 @@ if(WITH_MKLML) endif() if(WITH_MKLDNN) - list(APPEND MKL_SHARED_LIBS "${MKLDNN_LIB}" "${MKLDNN_LIB}.0") - list(APPEND MKL_DEPENDS mkldnn) + list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}") + list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib) endif() if(WITH_GPU) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 239fe4204b20a37a0869ba1e0e99adf4293dac7e..4fdf4090212e31adcccf6b119c937e70d5cbf995 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3622,8 +3622,13 @@ class ConcatenateLayer2(LayerBase): @config_layer('recurrent') class RecurrentLayer(LayerBase): + layer_type = 'recurrent' + def __init__(self, name, inputs, reversed=False, bias=True, **xargs): - super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, + use_mkl_packed = bool( + int(g_command_config_args.get("use_mkl_packed", 0))) + self.layer_type = 'mkl_packed_recurrent' if use_mkl_packed else 'recurrent' + super(RecurrentLayer, self).__init__(name, self.layer_type, 0, inputs, **xargs) config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input') input_layer = self.get_input_layer(0) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index ecba87191045cff6c05014010e60575741238f8d..e6f87ce61b1d16d4f98f111626776aa52c2ec35b 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -58,12 +58,12 @@ def is_compatible_with(x, Type): class HookAttribute(object): """ - Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs + Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs during the training process of a layer with parameters, such as img_conv layer, fc layer. - :param type: Hook type, currently supported types: + :param type: Hook type, currently supported types: 'pruning' : the user specifies a sparsity_ratio before training starts, and the - network will prune the parameters based on the sparsity_ratio. + network will prune the parameters based on the sparsity_ratio. eg: The definition of Hook object can be hk = HookAttribute('pruning', 0.6) The specific usage can be paddle.layer.img_conv(input=img, filter_size=3, num_channels=3, num_filters=64, @@ -71,10 +71,10 @@ class HookAttribute(object): The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf :type type: string - :param sparsity_ratio: Must be specified if hook type is 'pruning', + :param sparsity_ratio: Must be specified if hook type is 'pruning', it represents the ratio of the zero elements to be set by the Parameter. :type sparsity_ratio: float or None - + """ def __init__(self, type, sparsity_ratio=None): @@ -130,10 +130,12 @@ class ParameterAttribute(object): :param sparse_update: Enable sparse update for this parameter.
It will enable both local and remote sparse update. :type sparse_update: bool + :param update_hooks: A HookAttribute object. + :type update_hooks: HookAttribute :param initializer: If not None, it should be a callable object which accepts a parameter name and returns numpy array for the initial value of the parameter - :param initializer: callable object + :type initializer: callable object """ def __init__(self, diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 70f61e84997efdbe3d6f268d249be8bac15b9ecd..0de417df2cb942ce46ff8ac3acc61ae4999ed634 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -135,6 +135,8 @@ def init(**kwargs): cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] if 'use_mkldnn' in kwargs: cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn'] + if 'use_mkl_packed' in kwargs: + cp.g_command_config_args['use_mkl_packed'] = kwargs['use_mkl_packed'] assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not " "supported in v2 APIs.") diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 225b41c5043b5792abb90bbad53cbbfce9a3156e..5e01b8719806f4bb0c0d985373a5f4b076e05bd5 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -1,3 +1,4 @@ +from __future__ import print_function # import all class inside framework into fluid module import framework from framework import * @@ -27,7 +28,7 @@ __all__ = framework.__all__ + executor.__all__ + [ ] -def __read_gflags_from_env__(): +def __bootstrap__(): """ Enable reading gflags from environment variables. @@ -36,11 +37,30 @@ def __read_gflags_from_env__(): """ import sys import core + import os + + try: + num_threads = int(os.getenv('OMP_NUM_THREADS', '1')) + except ValueError: + num_threads = 1 + + if num_threads > 1: + print( + 'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation ' + 'speed will not be optimized if you use data parallel. 
It will ' + 'fail if this PaddlePaddle binary is compiled with OpenBlas since' + ' OpenBlas does not support multi-threads.'.format(num_threads), + file=sys.stderr) + print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr) + + os.environ['OMP_NUM_THREADS'] = str(num_threads) + read_env_flags = ['use_pinned_memory', 'check_nan_inf'] if core.is_compile_gpu(): read_env_flags.append('fraction_of_gpu_memory_to_use') core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) + core.init_glog(sys.argv[0]) if core.is_compile_gpu(): core.init_devices(["CPU", "GPU:0"]) @@ -48,4 +68,4 @@ core.init_devices(["CPU"]) -__read_gflags_from_env__() +__bootstrap__() diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index ac60bf543600008fd5339c1a378951374afc4ad6..66a7f737574c438a2b945bd4a49d8317bd460c80 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -7,7 +7,7 @@ __all__ = ['append_backward'] def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): """ - Traverse all ops in op_descs[begin_idx : end_idx], + Traverse all ops in op_descs[begin_idx : end_idx], if any op has inputs/outputs named "old_name", rename it as 'new_name' """ if begin_idx is None: @@ -162,7 +162,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): if core.grad_var_suffix() in arg and arg in no_grad_set: to_insert.append((_create_op_desc_("fill_zeros_like", { "X": [_strip_grad_suffix_(arg)] - }, {"Y": [arg]}, {}), idx)) + }, {"Out": [arg]}, {}), idx)) map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) @@ -182,7 +182,7 @@ def _append_backward_ops_(target, target(Variable): the target variable of forward pass block(Block): the block where forward ops are target_block(Block): the block which is going to hold new generated grad ops - no_grad_dict(dict): + no_grad_dict(dict): key(int) block index val(set) a set of variable names. These variables have no gradient grad_to_var(dict)(output argument): @@ -205,6 +205,7 @@ def _append_backward_ops_(target, # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, no_grad_dict[block.idx], grad_sub_block_list) + grad_op_descs.extend(grad_op_desc) grad_to_var.update(op_grad_to_var) @@ -275,8 +276,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): loss(Variable): The variable generated by cost function. parameter_list(list): Parameters that need to be updated by optimizer. If None, it means all parameters need to be updated. - no_grad_set(set): Variables that have no gradients in Block 0. - If None, the set will be generated inside the function and + no_grad_set(set): Variables that have no gradients in Block 0. + If None, the set will be generated inside the function and contains all variables with `step_gradient=True` from all blocks.
Return: diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 1d6c594b41a2c295e3818fb119362d1daba1de33..1b2075dcd5ece5706e62431b360d4dc86ea57a89 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -65,13 +65,6 @@ class Executor(object): p.set_place(each) act_places.append(p) - # TODO(dzhwinter) : consider that our fluid tests all written in - # CUDAPlace(gpu_id), this will be changed in the future - if core.is_compile_gpu(): - core.init_devices(["CPU", "GPU:0"]) - else: - core.init_devices(["CPU"]) - # TODO(dzhwinter) : only use the first place self.executor = core.Executor(act_places[0]) self.places = places diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index b66a8bce5f4f15539007876c113afd3f878b00bc..2dfb8b624160499492b7247f0e1dd1fba793817e 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -17,10 +17,6 @@ TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() -USE_CPU = core.kUseCPU() -USE_CUDNN = core.kUseMKLDNN() -USE_MKLDNN = core.kUseMKLDNN() - def grad_var_name(var_name): """ @@ -452,7 +448,7 @@ class Operator(object): no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv' + 'recv', 'parallel_do' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index c47ce82aba7fa5ac42ac26cd25fa3ebc93e96cb2..c63567601accd8c072368351f2838857bb61c818 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -36,7 +36,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): :param executor: executor that saves the variables :param dirname: directory path :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default g_program. + program which fit `predicate`. Default default_main_program. :param predicate: The Predicate describes a callable that returns a variable as a bool. If it returns true, the variables will be saved. :param vars: variables that need to be saved.
If vars is specified, program & predicate @@ -212,6 +212,11 @@ def save_inference_model(dirname, "fetch_var_names": fetch_var_names }, f, -1) + # Save only the ProgramDesc of inference_program in binary format + # in another file: __model__.dat + with open(model_file_name + ".dat", "wb") as fp: + fp.write(inference_program.desc.serialize_to_string()) + save_params(executor, dirname, main_program) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index acc22bef98b6eac4291bb2181e6d5cd7dbe2a768..9ad021fa992e5e8dbfebe96cf40ae602b0ed99b5 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -6,12 +6,13 @@ import contextlib from ..registry import autodoc __all__ = [ - 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard', - 'StaticRNNMemoryLink', 'WhileGuard', 'While', 'lod_rank_table', - 'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor', - 'increment', 'array_write', 'create_array', 'less_than', 'array_read', - 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock', - 'StaticRNN', 'reorder_lod_tensor_by_rank' + 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', + 'BlockGuardWithCompletion', 'StaticRNNMemoryLink', 'WhileGuard', 'While', + 'lod_rank_table', 'max_sequence_len', 'topk', 'lod_tensor_to_array', + 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', + 'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse', + 'DynamicRNN', 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank', + 'ParallelDo' ] @@ -132,29 +133,129 @@ class BlockGuard(object): return True -class StaticRNNGuard(BlockGuard): +class ParallelDo(object): """ - StaticRNNGuard class. + ParallelDo class. - StaticRNNGuard class is used to create a StaticRNN block in a program. + ParallelDo class is used to create a ParallelDo block in a program.
+ """ + + def __init__(self, places, name=None): + self.helper = LayerHelper("parallel_do", name=name) + self.inputs = [] + self.places = places + self.outputs = [] + self.status = StaticRNN.BEFORE_RNN_BLOCK + + def do(self): + return BlockGuardWithCompletion(self) + + def parent_block(self): + prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + return parent_block + + def __call__(self, *args, **kwargs): + if self.status != StaticRNN.AFTER_RNN_BLOCK: + raise ValueError("RNN output can only be retrieved after rnn block") + if len(self.outputs) == 0: + raise ValueError("RNN has no output") + elif len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def read_input(self, var): + self.inputs.append(var) + return var + + def write_output(self, var): + self.outputs.append(var) + + def get_parameters(self): + main_program = self.helper.main_program + current_block = main_program.current_block() + parent_block = self.parent_block() + + local_inputs = set() + + for op in current_block.ops: + for oname in op.output_names: + for out_var_name in op.output(oname): + local_inputs.add(out_var_name) + + for var in self.inputs: + local_inputs.add(var.name) + + params = list() + for op in current_block.ops: + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in local_inputs: + params.append(in_var_name) + + return [parent_block.var(name) for name in params] + + def complete_op(self): + main_program = self.helper.main_program + current_block = main_program.current_block() + parent_block = self.parent_block() + + step_scope = parent_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + self.outputs = [ + parent_block.create_var( + name=o.name, + shape=o.shape, + dtype=o.dtype, + lod_level=o.lod_level, + persistable=o.persistable, + stop_gradient=o.stop_gradient) for o in self.outputs + ] + + inputs = [parent_block.var(i.name) for i in self.inputs] + outputs = [parent_block.var(o.name) for o in self.outputs] + + parent_block.append_op( + type='parallel_do', + inputs={ + 'inputs': inputs, + 'parameters': self.get_parameters(), + 'places': self.places + }, + outputs={'outputs': outputs, + 'parallel_scopes': [step_scope]}, + attrs={'sub_block': current_block}) + + +class BlockGuardWithCompletion(BlockGuard): + """ + BlockGuardWithCompletion class. + + BlockGuardWithCompletion class is used to create an op with a block in a program. 
""" def __init__(self, rnn): - if not isinstance(rnn, StaticRNN): - raise TypeError("StaticRNNGuard takes a StaticRNN") - super(StaticRNNGuard, self).__init__(rnn.helper.main_program) + if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)): + raise TypeError( + "BlockGuardWithCompletion takes a StaticRNN or ParallelDo") + super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) self.rnn = rnn def __enter__(self): self.rnn.status = StaticRNN.IN_RNN_BLOCK - return super(StaticRNNGuard, self).__enter__() + return super(BlockGuardWithCompletion, self).__enter__() def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_rnn_op() - return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb) + self.rnn.complete_op() + return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, + exc_tb) class StaticRNNMemoryLink(object): @@ -200,7 +301,7 @@ class StaticRNN(object): self.seq_len = None def step(self): - return StaticRNNGuard(self) + return BlockGuardWithCompletion(self) def _assert_in_rnn_block_(self, method): if self.status != StaticRNN.IN_RNN_BLOCK: @@ -316,7 +417,7 @@ class StaticRNN(object): else: return self.outputs - def complete_rnn_op(self): + def complete_op(self): main_program = self.helper.main_program rnn_block = main_program.current_block() parent_block = self.parent_block() @@ -464,7 +565,7 @@ def lod_rank_table(x, level=0): """LoD Rank Table Operator. Given an input variable **x** and a level number of LoD, this layer creates a LodRankTable object. A LoDRankTable object contains a list of bi-element tuples. Each tuple consists of an index and - a length, both of which are int type. Reffering to specified level of LoD, + a length, both of which are int type. Refering to specified level of LoD, the index is the sequence index number and the length representes the sequence length. Please note that the list is ranked in descending order by the length. The following is an example: @@ -897,7 +998,7 @@ class ConditionalBlock(object): out_list = [ parent_block.var(var_name) for var_name in parent_block.vars - if var_name not in intermediate + if var_name in intermediate ] step_scope = parent_block.create_var( diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 55d8bf8a8a60a832000f7119a8bc039127ab1f3a..7feb479d2e3bc396eac53045888175ba54864b66 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -64,14 +64,14 @@ def fc(input, is flattened: the first `num_flatten_dims` dimensions will be flatten to form the first dimension of the final matrix (height of the - matrix), and the rest `rank(X) - num_col_dims` + matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and - `x_num_col_dims` = 3. Then, the flattened matrix + `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. - By default, `x_num_col_dims` is set to 1. + By default, `num_flatten_dims` is set to 1. param_attr(ParamAttr|list): The parameter attribute for learnable parameters/weights of the fully connected layer. 
@@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): Args: input(Variable): Input to the function - size(tuple|list|None): Shape of the look up table parameter + size(tuple|list|None): Shape of the look up table parameter is_sparse(bool): Boolean flag specifying whether the input is sparse param_attr(ParamAttr): Parameters for this layer dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc @@ -236,21 +236,50 @@ def gru_unit(input, activation='tanh', gate_activation='sigmoid'): """ - GRUUnit Operator implements partial calculations of the GRU unit as following: + GRU unit layer. The equation of a gru step is: - $$ - update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ - reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ - output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ - output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) - $$ + .. math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) - which is same as one time step of GRU Operator. + h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1}) - @note To implement the complete GRU unit, fully-connected operator must be - used before to feed xu, xr and xc as the Input of GRUUnit operator. + The inputs of gru unit include :math:`z_t`, :math:`h_{t-1}`. In terms + of the equation above, the :math:`z_t` is split into 3 parts - + :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to + implement a full GRU unit operator for an input, a fully + connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. + + The terms :math:`u_t` and :math:`r_t` represent the update and reset gates + of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is + an intermediate candidate hidden output, which is denoted by :math:`m_t`. + This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` + and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. + + Args: + input (Variable): The fc transformed input value of current step. + hidden (Variable): The hidden value of gru unit from previous step. + size (integer): The input dimension value. + weight (ParamAttr): The weight parameters for gru unit. Default: None + bias (ParamAttr): The bias parameters for gru unit. Default: None + activation (string): The activation type for cell (actNode). Default: 'tanh' + gate_activation (string): The activation type for gates (actGate). Default: 'sigmoid' + + Returns: + tuple: The hidden value, reset-hidden value and gate values. + + Examples: + + .. code-block:: python + + # assuming we have x_t_data and prev_hidden of size=10 + x_t = fluid.layers.fc(input=x_t_data, size=30) + hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(input=x_t, + hidden = prev_hidden) - TODO(ChunweiYan) add more document here """ activation_dict = dict( identity=0, @@ -366,9 +395,9 @@ def cross_entropy(input, label, **kwargs): 1) One-hot cross-entropy: `soft_label = False`, `Label[i, 0]` indicates the class index for sample i: - + .. math:: - + Y[i] = -\log(X[i, Label[i]]) 2) Soft-label cross-entropy: @@ -386,15 +415,15 @@ As a special case of 2), when each row of 'label' has only one non-zero element which is equal to 1, soft-label cross-entropy degenerates to a one-hot cross-entropy with one-hot label representation.
- + Args: - input (Variable|list): a 2-D tensor with shape [N x D], where N is the - batch size and D is the number of classes. This input is a probability + input (Variable|list): a 2-D tensor with shape [N x D], where N is the + batch size and D is the number of classes. This input is a probability computed by the previous operator, which is almost always the result of a softmax operator. - label (Variable|list): the ground truth which is a 2-D tensor. When - `soft_label` is set to `False`, `label` is a tensor with shape - [N x 1]. When `soft_label` is set to `True`, `label` is a + label (Variable|list): the ground truth, which is a 2-D tensor. When + `soft_label` is set to `False`, `label` is a tensor with shape + [N x 1]. When `soft_label` is set to `True`, `label` is a tensor with shape [N x D]. soft_label (bool, via `**kwargs`): a flag indicating whether to interpret the given labels as soft labels, default `False`. @@ -403,7 +432,7 @@ def cross_entropy(input, label, **kwargs): A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ + `ValueError`: 1) the 1st dimensions of `input` and `label` are not equal; 2) when \ `soft_label == True`, and the 2nd dimensions of `input` and `label` are not \ equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. @@ -727,9 +756,9 @@ def conv2d(input, def sequence_pool(input, pool_type, **kwargs): """ - This function add the operator for sequence pooling. - It pools features of all time-steps of each instance, and is applied - on top of the input using pool_type mentioned in the parameters. + This function adds the operator for sequence pooling. + It pools features of all time-steps of each instance, and is applied + on top of the input using the pool_type mentioned in the parameters. It supports four pool_type: @@ -758,7 +787,7 @@ def sequence_pool(input, pool_type, **kwargs): Args: input(variable): The input variable which is a LoDTensor. - pool_type (string): The pooling type of sequence_pool. + pool_type (string): The pooling type of sequence_pool. It supports average, sum, sqrt and max. Returns: @@ -768,7 +797,7 @@ def sequence_pool(input, pool_type, **kwargs): .. code-block:: python - x = fluid.layers.data(name='x', shape=[7, 1], + x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') @@ -816,7 +845,7 @@ def sequence_first_step(input, **kwargs): .. code-block:: python - x = fluid.layers.data(name='x', shape=[7, 1], + x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) x_first_step = fluid.layers.sequence_first_step(input=x) """ @@ -849,7 +878,7 @@ def sequence_last_step(input, **kwargs): .. code-block:: python - x = fluid.layers.data(name='x', shape=[7, 1], + x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) x_last_step = fluid.layers.sequence_last_step(input=x) """ @@ -1168,25 +1197,26 @@ def lstm_unit(x_t, ..
math:: - i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i) + i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i) - f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f) + f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f) - c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c) - o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o) + o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o) h_t & = o_t tanh(c_t) - The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and - :math:`c_{t-1}`. The implementation separates the linear transformation - and non-linear transformation apart. Here, we take :math:`i_t` as an - example. The linear transformation is applied by calling a `fc` layer and - the equation is: + The inputs of the lstm unit include :math:`x_t`, :math:`h_{t-1}` and + :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}` + should be the same. The implementation separates the linear transformation and + non-linear transformation. Here, we take :math:`i_t` as an example. + The linear transformation is applied by calling a `fc` layer and the + equation is: .. math:: - L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i + L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i The non-linear transformation is applied by calling `lstm_unit_op` and the equation is: @@ -1198,9 +1228,12 @@ def lstm_unit(x_t, This layer has two outputs including :math:`h_t` and :math:`o_t`. Args: - x_t (Variable): The input value of current step. - hidden_t_prev (Variable): The hidden value of lstm unit. - cell_t_prev (Variable): The cell value of lstm unit. + x_t (Variable): The input value of the current step, a 2-D tensor with shape + M x N, M for batch size and N for input size. + hidden_t_prev (Variable): The hidden value of the lstm unit, a 2-D tensor + with shape M x S, M for batch size and S for size of lstm unit. + cell_t_prev (Variable): The cell value of the lstm unit, a 2-D tensor with + shape M x S, M for batch size and S for size of lstm unit. forget_bias (float): The forget bias of lstm unit. param_attr (ParamAttr): The attributes of parameter weights, used to set initializer, name etc. @@ -1213,14 +1246,15 @@ def lstm_unit(x_t, Raises: ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ - and **cell_t_prev** not be the same. + and **cell_t_prev** not be the same or the 2nd dimensions of \ + **hidden_t_prev** and **cell_t_prev** not be the same. Examples: ..
code-block:: python x_t = fluid.layers.fc(input=x_t_data, size=10) - prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20) + prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30) prev_cell = fluid.layers.fc(input=prev_cell_data, size=30) hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t, hidden_t_prev=prev_hidden, @@ -1239,7 +1273,11 @@ def lstm_unit(x_t, if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[ 0] != cell_t_prev.shape[0]: - raise ValueError("The 1s dimension of x_t, hidden_t_prev and " + raise ValueError("The 1st dimensions of x_t, hidden_t_prev and " + "cell_t_prev must be the same.") + + if hidden_t_prev.shape[1] != cell_t_prev.shape[1]: + raise ValueError("The 2nd dimensions of hidden_t_prev and " "cell_t_prev must be the same.") if bias_attr is None: @@ -1268,17 +1306,17 @@ def lstm_unit(x_t, def reduce_sum(input, dim=None, keep_dim=False): """ - Computes the sum of tensor elements over the given dimension. + Computes the sum of tensor elements over the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the sum is performed. If - :attr:`None`, sum all elements of :attr:`input` and return a - Tensor variable with a single element, otherwise must be in the - range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, + dim (int|None): The dimension along which the sum is performed. If + :attr:`None`, sum all elements of :attr:`input` and return a + Tensor variable with a single element, otherwise must be in the + range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to keep the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: @@ -1312,17 +1350,17 @@ def reduce_sum(input, dim=None, keep_dim=False): def reduce_mean(input, dim=None, keep_dim=False): """ - Computes the mean of tensor elements over the given dimension. + Computes the mean of tensor elements over the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the mean is computed. If - :attr:`None`, compute the mean over all elements of :attr:`input` - and return a Tensor variable with a single element, otherwise - must be in the range :math:`[-rank(input), rank(input))`. If + dim (int|None): The dimension along which the mean is computed. If + :attr:`None`, compute the mean over all elements of :attr:`input` + and return a Tensor variable with a single element, otherwise + must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to keep the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: @@ -1356,22 +1394,22 @@ def reduce_mean(input, dim=None, keep_dim=False): def reduce_max(input, dim=None, keep_dim=False): """ - Computes the maximum of tensor elements over the given dimension. + Computes the maximum of tensor elements over the given dimension.
Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the maximum is computed. - If :attr:`None`, compute the maximum over all elements of - :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. + dim (int|None): The dimension along which the maximum is computed. + If :attr:`None`, compute the maximum over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to keep the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: Variable: The reduced Tensor variable. - + Examples: .. code-block:: python @@ -1400,22 +1438,22 @@ def reduce_max(input, dim=None, keep_dim=False): def reduce_min(input, dim=None, keep_dim=False): """ - Computes the minimum of tensor elements over the given dimension. + Computes the minimum of tensor elements over the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the minimum is computed. - If :attr:`None`, compute the minimum over all elements of - :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. + dim (int|None): The dimension along which the minimum is computed. + If :attr:`None`, compute the minimum over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to keep the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: Variable: The reduced Tensor variable. - + Examples: ..
code-block:: python diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index d2ff6841a317aaf6903edadc9213f69ef6c41216..23fe13f9bbf3e81802ac86415472e6aa603711b1 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -1,9 +1,24 @@ from ..registry import register_layer -__all__ = [ - 'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose', - 'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div', - 'elementwise_sub', 'elementwise_mul', 'clip', 'abs', 'sequence_softmax' + +__activations__ = [ + 'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round' ] +__all__ = [ + 'mean', + 'mul', + 'dropout', + 'reshape', + 'scale', + 'transpose', + 'sigmoid_cross_entropy_with_logits', + 'elementwise_add', + 'elementwise_div', + 'elementwise_sub', + 'elementwise_mul', + 'clip', + 'sequence_softmax', +] + __activations__ + for _OP in set(__all__): globals()[_OP] = register_layer(_OP) diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 8acd470c5ed5fa8eeda396f1e9182db4ecdd7016..74ca56182c47de2e74e80a56bf84dcf90ca6c104 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -4,6 +4,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 import paddle.v2.fluid as fluid +import time word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) @@ -160,7 +161,8 @@ def main(): paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) - place = fluid.CPUPlace() + #place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target @@ -174,6 +176,7 @@ def main(): embedding_param.set( load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) + start_time = time.time() batch_id = 0 for pass_id in xrange(PASS_NUM): chunk_evaluator.reset(exe) @@ -191,6 +194,9 @@ def main(): f1_score) + " pass_precision:" + str( pass_precision) + " pass_recall:" + str(pass_recall) + " pass_f1_score:" + str(pass_f1_score)) + if batch_id != 0: + print("seconds per batch: " + str((time.time() - start_time) + / batch_id)) # exit early for CI exit(0) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index fc073f6be8563a363c0f98b9235ae267fa68562d..51bfe2973db7bd2ec4b43bb588be4c1fcfb11e74 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -14,6 +14,7 @@ hidden1 = fluid.layers.fc(input=image, param_attr=fluid.ParamAttr( regularizer=regularizer, clip=fluid.clip.ClipByValue(10))) + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu', @@ -73,5 +74,9 @@ for pass_id in range(PASS_NUM): + " test_acc=" + str(test_pass_acc)) if test_pass_acc > 0.7: + fluid.io.save_inference_model( + "./recognize_digits_mlp.inference.model/", ["x"], [predict], + exe) exit(0) + exit(1) diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index e82e3ab0c9c0bc75a13a8948fda925bc4f0b6512..958300e655e012b91598360105ca2734c3bd2c37 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++
b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -47,6 +49,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): class TestConv2dOp(OpTest): def setUp(self): + core.use_cuda() self.init_op_type() self.init_group() self.init_dilation() @@ -167,26 +170,31 @@ class TestWithDilation(TestConv2dOp): #----------------Conv2dCudnn---------------- class TestCudnn(TestConv2dOp): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithPad(TestWithPad): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithStride(TestWithStride): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithGroup(TestWithGroup): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWith1x1(TestWith1x1): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 9d2dcca56dd1361b9e2448be9f1d5403f8ee17e3..77f0f11f1bcd5fa88700a33eec5a2abc2666ed02 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -177,8 +177,8 @@ class TestBook(unittest.TestCase): name='x_t_data', shape=[10, 10], dtype='float32') x_t = layers.fc(input=x_t_data, size=10) prev_hidden_data = layers.data( - name='prev_hidden_data', shape=[10, 20], dtype='float32') - prev_hidden = layers.fc(input=prev_hidden_data, size=20) + name='prev_hidden_data', shape=[10, 30], dtype='float32') + prev_hidden = layers.fc(input=prev_hidden_data, size=30) prev_cell_data = layers.data( name='prev_cell', shape=[10, 30], dtype='float32') prev_cell = layers.fc(input=prev_cell_data, size=30) diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py new file mode 100644 index 0000000000000000000000000000000000000000..2788f4e519b31b45250fbb923b2309e8bb1f6fa1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -0,0 +1,46 @@ +import unittest + +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid as fluid +from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.executor import Executor +from paddle.v2.fluid.backward import append_backward +import numpy as np +import paddle.v2.fluid.core as core + + +class ParallelOpTest(unittest.TestCase): + def setUp(self): + x = layers.data( + shape=[-1, 30, 40], + dtype='float32', + name='x', + append_batch_size=False, + stop_gradient=False) + + places = fluid.default_main_program().global_block().create_var() + pd = layers.ParallelDo(places=places) + + with pd.do(): + data = pd.read_input(x) + hidden = layers.fc(input=data, size=7) + pd.write_output(hidden) + data = pd() + loss = layers.mean(x=data) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(loss) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + exe.run(fluid.default_main_program(), + feed={ + x.name: np.random.uniform(0.1, 0.6, + (20, 30, 40)).astype("float32") + }) + + def test_forward(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py index 7c136f6360ce73a7c532b5486e544796e6853bcb..8b79d448e263d00849877c29158d7898bafe1937 100644 --- 
a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -1,46 +1,186 @@ import unittest import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core import numpy class TestReorderLoDTensor(unittest.TestCase): - def test_reorder(self): - dat = fluid.layers.data(name='input', shape=[1], lod_level=2) + num_seq = 5 + # [name, dim, lod_level] pair indicating data info of source and target + data_desc = (['input', 9, 0], ['ref', 5, 1]) + + @classmethod + def setUpClass(cls): + cls.set_program() + + @classmethod + def set_program(cls): + dat = fluid.layers.data( + name=cls.data_desc[0][0], shape=[cls.data_desc[0][1]]) dat.stop_gradient = False - rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1) + rank_dat = fluid.layers.data( + name=cls.data_desc[1][0], shape=[cls.data_desc[1][1]]) table = fluid.layers.lod_rank_table(rank_dat) new_dat = fluid.layers.reorder_lod_tensor_by_rank( x=dat, rank_table=table) - loss = fluid.layers.mean(x=new_dat) + loss = fluid.layers.reduce_sum(new_dat) fluid.backward.append_backward(loss=loss) + cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD'] + + def run_program(self): + outputs = [] + input_grads = [] + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.CUDAPlace(0)) + for place in places: + self.set_inputs(place) + exe = fluid.Executor(place) + output, input_grad = exe.run(fluid.default_main_program(), + feed=self.inputs, + fetch_list=self.fetch_list, + return_numpy=False) + outputs.append(output) + input_grads.append(input_grad) + self.actual_outputs = outputs + self.actual_grads = input_grads + + def set_data(self): + self.data = {} + for desc in self.data_desc: + data_name = desc[0] + data_dim = desc[1] + data_lod_level = desc[2] + data_lod = [] + for i in range(data_lod_level): + lod_level_i = numpy.random.randint( + low=1, + high=5, + size=self.num_seq if i == 0 else lod_level_i[-1]) + lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + data_lod.append(lod_level_i) + data_value = numpy.random.random(size=[ + data_lod[-1][-1] if data_lod else self.num_seq, data_dim + ]).astype('float32') + self.data[data_name] = (data_value, data_lod) + + def set_inputs(self, place): + self.inputs = {} + for desc in self.data_desc: + tensor = fluid.Tensor() + tensor.set(self.data[desc[0]][0], place) + if self.data[desc[0]][1]: + tensor.set_lod(self.data[desc[0]][1]) + self.inputs[desc[0]] = tensor + + def reorder(self): + level = 0 + + # compute the rank_table according to ref_lod + ref_lod = self.data[self.data_desc[1][0]][1][level] + rank_table = [] # list of (index, length) + for i in range(len(ref_lod) - 1): + rank_table.append((i, ref_lod[i + 1] - ref_lod[i])) + rank_table = sorted(rank_table, lambda x, y: y[1] - x[1]) + + # compute the input sequence info according to input_lod + input_value, input_lod = self.data[self.data_desc[0][0]] + + input_table = [] # list of (offset, length, sub_lod) + if input_lod: + for i in range(len(input_lod[level]) - 1): + start_idx = i + end_idx = i + 1 + sub_lod = [] + for lod_level_i in input_lod[level:]: + sub_lod_i = [] + for idx in range(start_idx, end_idx): + sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[ + idx]) + sub_lod.append(sub_lod_i) + start_idx = lod_level_i[start_idx] + end_idx = lod_level_i[end_idx] + input_table.append((start_idx, end_idx - start_idx, sub_lod)) + else: + input_table = [(i, 1, []) for i in range(len(rank_table))] + + # reorder by rank_table + output_value = 
numpy.zeros_like(input_value) + output_lod = [] + offset = 0 + for index, length in rank_table: + input_seq_start = input_table[index][0] + input_seq_len = input_table[index][1] + input_seq_end = input_seq_start + input_seq_len + output_value[offset:offset + input_seq_len] = input_value[ + input_seq_start:input_seq_end] + offset += input_seq_len + + input_seq_sub_lod = input_table[index][2] + if len(output_lod) == 0: + output_lod = [[0] for i in input_seq_sub_lod] + for i, sub_lod_i in enumerate(input_seq_sub_lod): + for idx_sub in sub_lod_i: + output_lod[i].append(output_lod[i][-1] + idx_sub) + return output_value, output_lod + + def test_reorder_lod_tensor(self): + self.data_desc[0][-1] = 2 # input is lod_tensor + self.set_data() + self.run_program() + # check output + expect_output, expect_output_lod = self.reorder() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + self.assertEqual(expect_output_lod, actual_output.lod()) + # check gradient + expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) + expect_grad_lod = self.data[self.data_desc[0][0]][1] + for actual_grad in self.actual_grads: + self.assertTrue( + numpy.allclose( + numpy.array(actual_grad), expect_grad, atol=0.001)) + self.assertEqual(expect_grad_lod, actual_grad.lod()) + + def test_reorder_tensor(self): + self.data_desc[0][-1] = 0 # input is tensor + self.set_data() + self.run_program() + # check output + expect_output, expect_output_lod = self.reorder() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + self.assertEqual(expect_output_lod, actual_output.lod()) + # check gradient + expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) + expect_grad_lod = self.data[self.data_desc[0][0]][1] + for actual_grad in self.actual_grads: + self.assertTrue( + numpy.allclose( + numpy.array(actual_grad), expect_grad, atol=0.001)) + self.assertEqual(expect_grad_lod, actual_grad.lod()) + global outputs_from_tensor_implicit_lod + outputs_from_tensor_implicit_lod = self.actual_outputs - cpu = fluid.CPUPlace() - exe = fluid.Executor(cpu) - exe.run(fluid.default_startup_program()) - - ref = fluid.Tensor() - ref_lod = [0, 3, 4, 7, 8, 14] - ref.set_lod([ref_lod]) - - ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu) - input = fluid.Tensor() - lod_level_0 = numpy.random.randint(low=1, high=5, size=5) - lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist() - lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1]) - lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist() - - input.set_lod([lod_level_0, lod_level_1]) - input.set( - numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'), - cpu) - - ig = exe.run(fluid.default_main_program(), - feed={'input': input, - 'ref': ref}, - fetch_list=['input@GRAD'], - return_numpy=False)[0] - self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001) - self.assertEqual(input.lod(), ig.lod()) + # compare outputs between LodTensors with explicit and implicit lod + # use the same data but set the input lod explicitly + input_lod = [[ + i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1) + ]] + self.inputs[self.data_desc[0][0]].set_lod(input_lod) + # preserve the output of LodTensor with implicit lod to compare + expect_output = [ + numpy.array(actual_output) for actual_output in self.actual_outputs + ] + self.run_program() + for actual_output in 
self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) if __name__ == '__main__': unittest.main()
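For reference, the rank-table ordering that `lod_rank_table` and the reorder test above rely on can be sketched in a few lines of plain Python; the offsets below are the level-0 LoD `[0, 3, 4, 7, 8, 14]` from the deleted version of the test:

.. code-block:: python

    # Five sequences with lengths 3, 1, 3, 1, 6, given as LoD offsets.
    ref_lod = [0, 3, 4, 7, 8, 14]

    # Build (index, length) pairs and sort by descending length, mirroring
    # rank_table in TestReorderLoDTensor.reorder(); ties keep input order.
    rank_table = sorted(
        [(i, ref_lod[i + 1] - ref_lod[i]) for i in range(len(ref_lod) - 1)],
        key=lambda pair: pair[1],
        reverse=True)
    assert rank_table == [(4, 6), (0, 3), (2, 3), (1, 1), (3, 1)]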