diff --git a/.clang-format b/.clang-format
index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
-
diff --git a/.travis.yml b/.travis.yml
index c51e02eb79a9e53a2b8d1d663e8f0c3e0d8c3a61..e2d49daa1981396628efa5d16459eb70e9e76884 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ before_install:
 script:
   - |
     timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
   - |
     if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65164b8472b902be8b0b9d5fb99807d012b8a666..e76512166fcaea5daf2a67d1259331b680f15b7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,6 +133,8 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/nccl)
+include(external/cares)
+include(external/grpc)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
diff --git a/Dockerfile b/Dockerfile
index 150344a8116e2be9b5bab8e5fdcc9c37f4025020..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
     apt-get clean -y
 
 # Install Go and glide
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e05111ee18efc906e39bcb56fb1be3b3c3dff5d6
--- /dev/null
+++ b/cmake/external/cares.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: c-ares is needed when linking with grpc.
+# It is fetched from git and built as a static library (--disable-shared).
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+
+ExternalProject_Add(
+    extern_cares
+    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+    GIT_TAG "cares-1_13_0"
+    PREFIX          ${CARES_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make
+    INSTALL_COMMAND make install
+)
+
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+             "${CARES_INSTALL_DIR}/lib/libcares.a")
+# Expose the installed headers and make the imported target wait for the build.
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..86122aec8c77f34756a37121582b92489d749d7f
--- /dev/null
+++ b/cmake/external/grpc.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+# Locations of the grpc checkout/build tree and of the installed artifacts.
+# GRPC_CPP_PLUGIN is the protoc plugin binary installed by the grpc build.
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+# NOTE(yuyang18):
+# Disable -Werror, otherwise the compile will fail in MacOS.
+# It seems that we cannot configure that by make command.
+# Just dry run make command and remove `-Werror`, then use a shell to run make commands
+IF(APPLE)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+ENDIF()
+
+# Build the static grpc libraries and grpc_cpp_plugin in the source tree with
+# grpc's own Makefile, then install them under GRPC_INSTALL_DIR.
+ExternalProject_Add(
+    extern_grpc
+    DEPENDS protobuf zlib
+    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+    GIT_TAG "v1.7.x"
+    PREFIX          ${GRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    # BUILD_CMD is platform specific; on macOS it strips -Werror (see the note above).
+    BUILD_COMMAND  ${BUILD_CMD}
+    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+
+# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
+# Every imported library depends on extern_grpc, so a target linking any one of
+# them waits for the external build, not only users of grpc++_unsecure.
+FOREACH(grpc_lib grpc++_unsecure grpc++ gpr grpc_unsecure)
+  ADD_LIBRARY(${grpc_lib} STATIC IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ${grpc_lib} PROPERTY IMPORTED_LOCATION
+               "${GRPC_INSTALL_DIR}/lib/lib${grpc_lib}.a")
+  ADD_DEPENDENCIES(${grpc_lib} extern_grpc)
+ENDFOREACH()
+
+# Make the installed grpc headers visible to targets in this directory scope.
+include_directories(${GRPC_INCLUDE_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index be7f6a9465970711170bd15dcecaadeaa8a55f86..7cfe1e68078eed023fd0cc6971c573bb0108b4cc 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
 FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+# Forget what FIND_PACKAGE(Protobuf) set: clear both the cache entry and the normal variable.
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
 
 if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
     function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
     # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
     # make `protobuf_generate_cpp` happy.
     SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
 
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
     if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
         message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
         SET_PROTOBUF_VERSION()
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index a98e069b7cd1654ddd5868560d0905eab6d9c692..1638cd8fdfc34575132462859e056a1907f0b2f1 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -50,6 +50,8 @@ ExternalProject_Add(
 )
 
 LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 
 IF(WITH_C_API)
   INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 404717187d08febd7c1486b31159d647f0ef3357..c917ca0ff4e087b7caae8876da127bec6b39b798 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -459,11 +459,58 @@ function(py_test TARGET_NAME)
   if(WITH_TESTING)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
+    set(multiValueArgs SRCS DEPS ARGS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
-             ${PYTHON_EXECUTABLE} ${py_test_SRCS}
+             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction()
+
+# grpc_library generates gRPC service code with grpc_cpp_plugin and message
+# code with protoc, then builds the generated code together with your
+# implementation sources. It defines three cc_library targets:
+# <name>_proto, <name>_grpc and <name> (which depends on the other two).
+#   SRCS  - your implementation source files
+#   PROTO - one .proto file; gRPC output goes to CMAKE_CURRENT_BINARY_DIR
+#   DEPS  - extra dependencies passed to cc_library for <name>
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+
+function(grpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating grpc ${grpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+
+  add_custom_command(
+          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+
+  # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
+  # as compiler warnings instead of error. Should try remove the warnings also.
+  set_source_files_properties(
+    ${grpc_grpc_srcs} ${grpc_library_SRCS}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+  # The plugin output includes the protoc-generated ${PROTO_WE}.pb.h, so the
+  # _grpc library must build after _proto to avoid a parallel-build race.
+  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}" DEPS "${TARGET_NAME}_proto")
+  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4d182f6692e09b3e40f3620b77d9a0f20ec5af3..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
 
 ..  _api_v2.layer_context_projection:
 
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 
 spp
 ---
@@ -104,7 +104,7 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
-    
+
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
@@ -114,7 +114,7 @@ row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
     :noindex:
-    
+
 Recurrent Layers
 ================
 
@@ -415,6 +415,13 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 
 Slicing and Joining Layers
 ==========================
diff --git a/doc/design/float16.md b/doc/design/float16.md
index 078801ba2ed969d26dd31d5ec4ed268686cf7016..1ea95ed6b5d6792171569b6ff76d09be92fcb13e 100644
--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
 - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
 
+### CUDA version issue
+There are currently three versions of CUDA that support the `__half` data type, namely, CUDA 7.5, 8.0, and 9.0.
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+};
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
 
 ## Implementation
 The float16 class holds a 16-bit `uint16_t` data internally.
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index ac7e98ccf1aadbb973a4801fde842375cf63448c..2b4f921ae93c3b443ed62a28b1fa9fbda14f73ab 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
 
 ## Abstract
 
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves us well, but has a few limitations:
 
-1. Need to write special code to handle tasks which should only be run
-  by a single trainer. E.g., initializing model and saving model.
+1. There is a need to write special code that handles tasks which should only be run on a single trainer, e.g., initializing the model, saving the model, etc.
 
-2. Model parallelism is hard: need to write if-else branches conditioned
-  on the trainer ID to partition model onto each trainer, and manually
-  write the inter-model-shard communication code.
+2. Model parallelism is hard: It requires writing if-else branches conditioned on the trainer ID to partition the model onto the trainers, and manually writing the inter-model-shard communication code to communicate between different trainers.
 
-3. The user can not directly specify the parameter update rule: need
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
+3. The user can not directly specify the parameter update rule: This would require modifying the parameter server code and compiling a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
 
-This design doc discusses PaddlePaddle's new distributed training
-architecture that addresses the above limitations.
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
 
 ## Analysis
 
-We will assume the user writes the trainer program by Python, the same
-analysis holds if the trainer program is written in C++.
+The assumption is that the user writes the trainer program in either Python or C++.
 
 ### Limitation 1
 
-If we look at the Python code that the user writes, there are two
-kinds of functionalities:
+There are two basic functionalities in the trainer program:
 
-- The training logic such as load / save model and print log.
-- The neural network definition such as the definition of the data
-  layer, the fully connected layer, the cost function and the
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
   optimizer.
 
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
-replicated Python instances are running on different nodes: both the
-training logic and the neural network computation is replicated.
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
 
-The tasks that should only run once all belong to the training logic,
-if we only replicate the neural network computation, but do **not**
-replicate the training logic, the limitation could be solved.
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
 
 ### Limitation 2
 
-Model parallelism means running a single model on multiple nodes by
-partitioning the model onto different nodes and managing the
-inter-model-shard communications.
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
 
-PaddlePaddle should be able to modify the nerual network computation
-definition to support model parallelism automatically. However, the
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
 
-Just like compiler uses a intermediate representation (IR) so that
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
 
 <img src="src/compiler.png"/>
 
-We can have our own IR too: PaddlePaddle can support model parallel by
-converting the IR so the user no longer need to manually do it in
-Python:
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
 <img src="src/paddle-compile.png"/>
 
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
-the computation dependency graph and the variables used in the
-computation.
+The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
 
 ### Limitation 3
 
-The user can not directly specify the parameter update rule for the
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
 
-This could be fixed by making the parameter server run the same
-computation definition as the trainer. For a detailed explanation,
-please
-see
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
 [Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
 
 ## Distributed Training Architecture
 
-The new distributed training architecture can address the above
-limitations. Below is the illustration:
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
 <img src="src/distributed_architecture.png"/>
 
-The architecture includes major components: *PaddlePaddle Python*,
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
 
 ### PaddlePaddle Python
 
-PaddlePaddle Python is the Python library that user's Python trainer
-invoke to build the neural network topology, start training, etc.
+PaddlePaddle Python is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
 
 ```Python
 paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
 	print cost_val
 ```
 
-The code above is a typical Python trainer code, the neural network
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
+The above code is a typical Python trainer program. The neural network topology is built using helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
 
 #### session.eval
 
-As shown in the graph, `session.eval` sends the IR and the evaluation
-inputs/targets to the PaddlePaddle cluster for evaluation. The
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
+The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
 
-The Python `session` is a wrapper of the C++ `Session` class. For more
-information about `Session`, please
-see [Design Doc: Session](./session.md).
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
 
 ### PaddlePaddle Converter
 
-PaddlePaddle converter automatically converts the IR in the request
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed:
 
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
-   fetches the eval targets to the IR.
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
 
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
 
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
 
-1. Place the OPs in the graph onto different devices on different
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
 
-1. Partition the graph according to runtime boundaries and add `send` /
-   `recv` OP pair on the runtime boundaries.
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
 
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. PaddlePaddle runtimes with the `fetch` OP report evaluation results back to the converter, and the converter reports the evaluation results back to PaddlePaddle Python.
 
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
-   
 The output IRs will be cached to optimize the conversion latency.
 
 
 #### Placement Algorithm
 
-Our first implementation will only support "trainer-parameter server"
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
 
-In the future, we will implement the general placement algorithm,
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
 
 
 ### PaddlePaddle Runtime
 
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
 
 
 ### Local Training Architecture
 
-The local training architecture will be the same as the distributed
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
 <img src="src/local_architecture.png"/>
 
 
 ### Training Data
 
-In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use the Python data reader: the training data are sent with `session.eval`. However, this should be used for debugging purposes only. The users are encouraged to use the read data OPs.
 
 
 ## References:
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
deleted file mode 100644
index b473944fc7fb89d3e0a0b330933f2226734bb5bd..0000000000000000000000000000000000000000
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-经典的线性回归任务
-==================
-
-PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
-
-任务简介
---------
-
-我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
-
-一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
-
-准备数据
------------
-
-假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
-
-.. code-block:: python
-
-    # dataprovider.py
-    from paddle.trainer.PyDataProvider2 import *
-    import random
-
-    # 定义输入数据的类型: 2个浮点数
-    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-    def process(settings, input_file):
-        for i in xrange(2000):
-            x = random.random()
-            yield [x], [2*x+0.3]
-
-训练模型
------------
-
-为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-
-在PaddlePaddle里，该模型的网络配置如下。
-
-.. code-block:: python
-
-    # trainer_config.py
-    from paddle.trainer_config_helpers import *
-
-    # 1. 定义数据来源，调用上面的process函数获得观测数据
-    data_file = 'empty.list'
-    with open(data_file, 'w') as f: f.writelines(' ')
-    define_py_data_sources2(train_list=data_file, test_list=None, 
-                            module='dataprovider', obj='process',args={})
-
-    # 2. 学习算法。控制如何改变模型参数 w 和 b
-    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-    # 3. 神经网络配置
-    x = data_layer(name='x', size=1)
-    y = data_layer(name='y', size=1)
-    # 线性计算网络层: ȳ = wx + b
-    ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-    # 计算误差函数，即  ȳ 和真实 y 之间的距离
-    cost = square_error_cost(input= ȳ, label=y)
-    outputs(cost)
-
-
-这段简短的配置展示了PaddlePaddle的基本用法：
-
-- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
-
-- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
-    
-    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
-    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**：回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
-
-定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
-
-.. code-block:: bash
-
-    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
-
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
-
-模型检验
------------
-
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-
-.. code-block:: python
-
-    import numpy as np
-    import os
-
-    def load(file_name):
-        with open(file_name, 'rb') as f:
-            f.read(16) # skip header for float type.
-            return np.fromfile(f, dtype=np.float32)
-        
-    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-    # w=1.999743, b=0.300137
-
-.. image:: ./parameters.png
-     :align: center
-     :scale: 80 %
-
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
-
-这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
deleted file mode 100644
index 2cc438ebbe0f97345d25354b93b4ebbd43502415..0000000000000000000000000000000000000000
--- a/doc/getstarted/basic_usage/index_en.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-Simple Linear Regression
-========================
-
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-
-Problem Background
-------------------
-
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-
-Prepare the Data
------------------
-
-Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-
-    .. code-block:: python
-
-        # dataprovider.py
-        from paddle.trainer.PyDataProvider2 import *
-        import random
-
-        # define data types of input: 2 real numbers
-        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-        def process(settings, input_file):
-            for i in xrange(2000):
-                x = random.random()
-                yield [x], [2*x+0.3]
-
-Train a NeuralNetwork
-----------------------
-
-To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
-
-    .. code-block:: python
-
-        # trainer_config.py
-        from paddle.trainer_config_helpers import *
-
-        # 1. read data. Suppose you saved above python code as dataprovider.py
-        data_file = 'empty.list'
-        with open(data_file, 'w') as f: f.writelines(' ')
-        define_py_data_sources2(train_list=data_file, test_list=None, 
-                module='dataprovider', obj='process',args={})
-
-        # 2. learning algorithm
-        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-        # 3. Network configuration
-        x = data_layer(name='x', size=1)
-        y = data_layer(name='y', size=1)
-        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = square_error_cost(input=y_predict, label=y)
-        outputs(cost)
-
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-
--  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-
--  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-
--  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-
-Now that everything is ready, you can train the network with a simple command line call:
-
-    .. code-block:: bash
- 
-        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- 
-
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-
-
-Evaluate the Model
--------------------
-
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
-
-In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
-
-    .. code-block:: python
-
-        import numpy as np
-        import os
-
-        def load(file_name):
-            with open(file_name, 'rb') as f:
-                f.read(16) # skip header for float type.
-                return np.fromfile(f, dtype=np.float32)
-                
-        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-        # w=1.999743, b=0.300137
-
-    .. image:: parameters.png
-        :align: center
-
-Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-
-There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png
deleted file mode 100644
index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000
Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c525bdad6f6118dcd560e2cb7bfaf89737c1362
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,141 @@
+从源码编译
+======================
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+
+编译PaddlePaddle，需要执行：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # 如果不使用Docker编译环境，执行下面的命令
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装，也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install python/dist/*.whl
+
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+如果不使用Docker，执行ctest命令即可：
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+
+.. _compile_deps:
+
+编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+编译选项
+----------------
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则使用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` ）**后，再指定。**
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
deleted file mode 100644
index 2f1461489495618718d5abaeab9cbeda9b93700f..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ /dev/null
@@ -1,236 +0,0 @@
-Installing from Sources
-==========================
-
-* [1. Download and Setup](#download)
-* [2. Requirements](#requirements)
-* [3. Build on Ubuntu](#ubuntu)
-* [4. Build on Centos](#centos)
-
-
-## <span id="download">Download and Setup</span> 
-You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-```
-## <span id="requirements">Requirements</span>
-
-To compile the source code, your computer must be equipped with the following dependencies.
-
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
-- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
-- **BLAS**: MKL, OpenBlas or ATLAS
-- **Python**: only support Python 2.7
-- **Go**
-
-**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
-For CUDA 8.0, GCC versions later than 5.3 are not supported!
-
-### Options
-
-PaddlePaddle supports some build options. 
-
-<html>
-<table> 
-<thead>
-<tr>
-<th scope="col" class="left">Optional</th>
-<th scope="col" class="left">Description</th>
-</tr>
-</thead>
-<tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
-<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
-<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
-<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
-<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
-<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
-<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
-<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
-</tbody>
-</table>
-</html>
-
-**Note:**
-  - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
-  - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
-  - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
-
-As a simple example, consider the following:  
-
-1. **BLAS Dependencies(optional)**
-  
-    CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
-    To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
-
-    ```bash
-    # specify MKL
-    cmake .. -DMKL_ROOT=<mkl_path>
-    # or specify OpenBLAS
-    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    ```
-
-2. **Doc Dependencies(optional)**
-
-    To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
-
-    ```bash
-    pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme recommonmark
-
-    # install doxygen on Ubuntu
-    sudo apt-get install doxygen 
-    # install doxygen on Mac OS X
-    brew install doxygen
-
-    # active docs in cmake
-    cmake .. -DWITH_DOC=ON`
-    ```
-
-## <span id="ubuntu">Build on Ubuntu 14.04</span>
-
-### Install Dependencies
-
-- **Paddle Dependencies**
-
-    ```bash
-    # necessary
-    sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
-    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
-    sudo pip install 'protobuf==3.1.0.post1'
-
-    # Install Go
-    # You can follow https://golang.org/doc/install for a detailed explanation.
-    wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C $HOME -xzf go.tgz && \
-    mkdir $HOME/gopath && \
-    rm go.tgz
-
-    # Setup environment variables
-    export GOROOT=$HOME/go
-    export GOPATH=$HOME/gopath
-    export PATH=$PATH:$GOROOT/bin
-
-    # install cmake 3.4
-    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
-        cd .. && rm -rf cmake-3.4.1
-    ```
-
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-
-```bash
-# you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
-
-## <span id="centos">Build on Centos 7</span>
-
-### Install Dependencies
-
-- **CPU Dependencies**
-
-    ```bash
-    # necessary
-    sudo yum update
-    sudo yum install -y epel-release
-    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
-    sudo pip install wheel numpy
-    sudo pip install 'protobuf>=3.0.0'
-    ```
-  
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-  
-```bash
-# you can add build option here, such as:    
-cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..76fbc43de2e83580dd79b874507c103533022436
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -0,0 +1,159 @@
+Build from Sources
+==========================
+
+.. _build_step:
+
+How To Build
+----------------
+
+PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
+tools. We recommend you to use our pre-built Docker image to run the build
+to avoid installing dependencies by yourself. We have several build environment
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+
+If you choose not to use Docker image for your build, you need to install the
+below `Compile Dependencies`_ before running the build.
+
+Then run:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build CPU-only binaries if you are using Docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # else run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+   pip install python/dist/*.whl
+
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the below steps:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+If you don't use Docker, running ctest will start the tests:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+--------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial, please refer to `here <https://cmake.org/cmake-tutorial>`_ .
+
+.. _build_options_bool:
+
+Bool Type Options
+-----------------
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_DOC", "Build documentation", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBLAS <http://www.openblas.net/>`_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used. For more information, see the `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
+keep up with the latest cuDNN versions. Be sure to run with the same version of cuDNN
+that you built with.
+
+Pass Compile Options
+++++++++++++++++++++
+
+You can pass compile options to use intended BLAS/CUDA/cuDNN libraries.
+When running the cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search paths that you
+passed to cmake, e.g.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time; you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change them.**
diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png
deleted file mode 100644
index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000
Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
deleted file mode 100644
index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
deleted file mode 100644
index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-编译选项,描述,注意
-MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h，${MKL_ROOT}/lib目录下需要包含mkl_core，mkl_sequential和mkl_intel_lp64三个库。
-ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h，${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
-OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h，${OPENBLAS_ROOT}/lib下需要包含openblas库。
-REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h，${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
deleted file mode 100644
index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/compile_options.csv
+++ /dev/null
@@ -1,12 +0,0 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
-WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 0d34dec8e908c5e61001500725187a2233797f46..f78b1fb0e11aa028a4b7abb5270740b97f8039e9 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,222 +1,139 @@
-PaddlePaddle的Docker容器使用方式
+使用Docker安装运行
 ================================
 
-PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。 请注意，您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
 
-Docker使用入门
-------------------------------
-
-几个基础的概念帮助理解和使用Docker：
+如果您在使用Windows，可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
 
-- *镜像*：一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行：
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
 
-  .. code-block:: bash
+.. _docker_pull:
 
-     docker images
+获取PaddlePaddle的Docker镜像
+------------------------------
 
-  来列出当前系统中的所有镜像，同样可以执行：
+执行下面的命令获取最新的PaddlePaddle Docker镜像
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用docker.paddlepaddle.org/paddle下载。
+     docker pull paddlepaddle/paddle
 
-- *容器*： 如果说一个Docker镜像就是一个程序，那容器就是这个程序运行时产生的“进程”。
-  实际上，一个容器就是一个操作系统的进程，但是是运行在独立的进程空间，文件系统以及网络之上。
-  可以执行：
+对于国内用户，我们提供了加速访问的镜像源：
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
 
-  来使用一个镜像启动一个容器。
-
-- 默认情况下，Docker容器会运行在独立的文件系统空间之上，我们无法在Docker容器中
-  访问到主机上的文件。可以通过*挂载Volume*的方式，将主机上的文件或目录挂载到
-  Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下，容器使用
-  debian镜像，并且启动后执行 :code:`ls /data`。
+下载GPU版本的Docker镜像：
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-PaddlePaddle发布的Docker镜像使用说明
-------------------------------
-
-我们把PaddlePaddle的编译环境打包成一个镜像，称为开发镜像，里面涵盖了
-PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
-像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
-PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
-行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
-`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 
-和国内镜像`docker.paddlepaddle.org` 提供最新
-的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
-
-**注意：为了方便在国内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您在国内，请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。**
-
-1. 开发镜像：:code:`paddlepaddle/paddle:0.10.0-dev`
-
-   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
-   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
-   开发镜像包含了以下工具：
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-   很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作，
-   也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
-
-   以交互容器方式运行开发镜像：
-
-   .. code-block:: bash
-
-      docker run -it --rm -v $(pwd):/paddle  paddlepaddle/paddle:0.10.0-dev /bin/bash
-
-   或者，可以以后台进程方式运行容器：
-
-   .. code-block:: bash
-
-      docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D
-
-   然后用密码 :code:`root` SSH进入容器：
-
-   .. code-block:: bash
-
-      ssh -p 2202 root@localhost
-
-   SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
-
-2. 生产镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
-
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-
-   纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   如果输出是No，就需要选择使用no-AVX的镜像
-
-   **注：在0.10.0之后的版本，PaddlePaddle都可以自动判断硬件是否支持AVX，所以无需判断AVX即可使用**
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
 
-   以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
-   为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
+选择下载使用不同的BLAS库的Docker镜像：
 
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash
+  .. code-block:: bash
 
-   注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法。：
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   .. code-block:: bash
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
 
-      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
+  .. code-block:: bash
 
-3. 运行以及发布您的AI程序
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
 
-   假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
+.. _docker_run:
 
-   .. code-block:: bash
+在Docker中执行PaddlePaddle训练程序
+----------------------------------
 
-      docker run -it -v $PWD:/work paddle /work/a.py
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
 
-   如果要使用GPU，请运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+ 
+上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
 
-      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
 
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work && python train.py
 
-   这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0`
-   创建和发布自己的AI程序镜像。
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
 
-运行PaddlePaddle Book
----------------------
+.. _docker_run_book:
 
-Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
+使用Docker启动PaddlePaddle Book教程
+-----------------------------------
 
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
 
 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
 
-.. code-block:: bash
+  .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 
 然后在浏览器中输入以下网址：
 
-.. code-block:: text
+  .. code-block:: text
 
-    http://localhost:8888/
+     http://localhost:8888/
 
 就这么简单，享受您的旅程！
 
-通过Docker容器开发PaddlePaddle
-------------------------------
-
-开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
+.. _docker_run_gpu:
 
-1. 制作PaddlePaddle开发镜像
-
-   PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。
-   生成Docker镜像的方式有两个，一个是直接把一个容器转换成镜像，另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷，适合自己实验，可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚，其他人很容易看懂镜像生成过程，持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行：
-
-   .. code-block:: bash
-      
-      git clone https://github.com/PaddlePaddle/Paddle.git
-      cd Paddle
-      docker build -t paddle:dev .
-
-   docker build这个命令的-t指定了生成的镜像的名字，这里我们用paddle:dev。到此，PaddlePaddle开发镜像就被构建完毕了。
+使用Docker执行GPU训练
+------------------------------
 
-2. 制作PaddlePaddle生产镜像
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
 
-   生产镜像的生成分为两步，第一步是运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
-   以上命令会编译PaddlePaddle，生成运行程序，以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU，“WITH_AVX”控制生成的生产镜像是否支持AVX，”WITH_TEST“控制是否生成单元测试。
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
 
-   第二步是运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker build -t paddle:prod -f build/Dockerfile ./build
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
-   以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置，最终生成名为paddle:prod的生产镜像。
+**关于AVX：**
 
-3. 运行单元测试
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
 
-   运行以下指令：
+以下指令能检查Linux电脑是否支持AVX：
 
    .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-
-文档
-----
-
-Paddle的Docker开发镜像带有一个通过 `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码，便于用户浏览C++源码。
 
-只要在Docker里启动PaddlePaddle的时候给它一个名字，就可以再运行另一个Nginx Docker镜像来服务HTML代码：
-
-.. code-block:: bash
-
-   docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 94860240f6a4a9bed8a865684a8a79960489280e..d7acc7aeb744b19d83acb520d07c8551168dd096 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -1,270 +1,146 @@
-PaddlePaddle in Docker Containers
+Run in Docker Containers
 =================================
 
-Docker container is currently the only officially-supported way to
-running PaddlePaddle.  This is reasonable as Docker now runs on all
-major operating systems including Linux, Mac OS X, and Windows.
-Please be aware that you will need to change `Dockers settings
-<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
-of your hardware resource on Mac OS X and Windows.
+Running PaddlePaddle in a Docker container means you don't need to care about
+runtime dependencies, and it also lets you run under Windows. You can find
+Docker tutorials `here <https://docs.docker.com/get-started/>`_ .
 
-Working With Docker
--------------------
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running docker under windows.
 
-Docker is simple as long as we understand a few basic concepts:
+After you've read the above tutorials, you may proceed with the following steps.
 
-- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type
+.. _docker_pull:
 
-  .. code-block:: bash
-
-     docker images
+Pull PaddlePaddle Docker Image
+------------------------------
 
-  to list all images in the system. We can also run
+Run the following command to download the latest Docker images:
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  to download a Docker image, paddlepaddle/paddle in this example,
-  from Dockerhub.com.
+     docker pull paddlepaddle/paddle
 
-- *container*: considering a Docker image a program, a container is a
-  "process" that runs the image. Indeed, a container is exactly an
-  operating system process, but with a virtualized filesystem, network
-  port space, and other virtualized environment. We can type
+For users in China, we provide a faster mirror:
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
 
-  to start a container to run a Docker image, paddlepaddle/paddle in this example.
-
-- By default docker container have an isolated file system namespace,
-  we can not see the files in the host file system. By using *volume*,
-  mounted files in host will be visible inside docker container.
-  Following command will mount current dirctory into /data inside
-  docker container, run docker container from debian image with
-  command :code:`ls /data`.
+Download GPU version images:
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-Usage of CPU-only and GPU Images
-----------------------------------
-
-We package PaddlePaddle's compile environment into a Docker image,
-called the develop image, it contains all compiling tools that
-PaddlePaddle needs. We package compiled PaddlePaddle program into a
-Docker image as well, called the production image, it contains all
-runtime environment that running PaddlePaddle needs. For each version
-of PaddlePaddle, we release both of them. Production image includes
-CPU-only version and a CUDA GPU version and their no-AVX versions.
-
-We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
-latest versions under "tags" tab at dockerhub.com. 
-
-** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.**
-
-
-1. development image :code:`paddlepaddle/paddle:<version>-dev`
-
-   This image has packed related develop tools and runtime
-   environment. Users and developers can use this image instead of
-   their own local computer to accomplish development, build,
-   releasing, document writing etc. While different version of paddle
-   may depends on different version of libraries and tools, if you
-   want to setup a local environment, you must pay attention to the
-   versions.  The development image contains:
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-     
-   Many developers use servers with GPUs, they can use ssh to login to
-   the server and run :code:`docker exec` to enter the docker
-   container and start their work.  Also they can start a development
-   docker image with SSHD service, so they can login to the container
-   and start work.
-
-2. Production images, this image might have multiple variants:
-
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-
-   Please be aware that the CPU-only and the GPU images both use the
-   AVX instruction set, but old computers produced before 2008 do not
-   support AVX.  The following command checks if your Linux computer
-   supports AVX:
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   **NOTE：versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.**
-   To run the CPU-only image as an interactive container:
-
-   .. code-block:: bash
-
-      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash
-
-   Above method work with the GPU image too -- the recommended way is
-   using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
-
-   Please install nvidia-docker first following this `tutorial
-   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
-
-   Now you can run a GPU image:
-
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
-
-
-Train Model Using Python API
-----------------------------
-
-Our official docker image provides a runtime for PaddlePaddle
-programs. The typical workflow will be as follows:
-
-Create a directory as workspace:
-
-.. code-block:: bash
-
-   mkdir ~/workspace
-
-Edit a PaddlePaddle python program using your favourite editor
-
-.. code-block:: bash
-
-   emacs ~/workspace/example.py
-
-Run the program using docker:
-
-.. code-block:: bash
-
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
-
-Or if you are using GPU for training:
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
 
-.. code-block:: bash
+Choose between different BLAS versions:
 
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
-
-Above commands will start a docker container by running :code:`python
-/workspace/example.py`. It will stop once :code:`python
-/workspace/example.py` finishes.
-
-Another way is to tell docker to start a :code:`/bin/bash` session and
-run PaddlePaddle program interactively:
-
-.. code-block:: bash
-
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-Running with GPU is identical:
-
-.. code-block:: bash
-
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-
-Develop PaddlePaddle or Train Model Using C++ API
----------------------------------------------------
-
-We will be using PaddlePaddle development image since it contains all
-compiling tools and dependencies.
+  .. code-block:: bash
 
-1. Build PaddlePaddle develop image
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   Use following command to build PaddlePaddle develop image:
 
-   .. code-block:: bash
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
 
-      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
-      docker build -t paddle:dev .
-
-2. Build PaddlePaddle production image
+  .. code-block:: bash
 
-   There are two steps for building production image, the first step is to run:
+     docker pull paddlepaddle/paddle:[tag]
+     # e.g.
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
 
-   .. code-block:: bash
+.. _docker_run:
 
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+Launch your training program in Docker
+--------------------------------------
 
-   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
 
-   The second step is to run:
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
 
-      docker build -t paddle:prod -f build/Dockerfile ./build
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` specifies the image to use; finally
+:code:`/work/train.py` is the command to run inside docker.
 
-   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
+Also, you can go into the container shell, run or debug your code
+interactively:
 
-3. Run unit test
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work && python train.py
 
-   Following command will run unit test:
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
 
-   .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+.. _docker_run_book:
 
 PaddlePaddle Book
 ------------------
 
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers. If you want to
 dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 
 We provide a packaged book image, simply issue the command:
 
-.. code-block:: bash
+  .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 
 Then, you would back and paste the address into the local browser:
 
-.. code-block:: text
+  .. code-block:: text
 
-    http://localhost:8888/
+     http://localhost:8888/
 
 That's all. Enjoy your journey!
 
+.. _docker_run_gpu:
 
-Documentation
--------------
+Train with Docker with GPU
+------------------------------
 
-Paddle Docker images include an HTML version of C++ source code
-generated using `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
-for users to browse and understand the C++ source code.
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please ensure you have the latest
+GPU driver installed before moving on.
 
-As long as we give the Paddle Docker container a name, we can run an
-additional Nginx Docker container to serve the volume from the Paddle
-container:
+  .. code-block:: bash
 
-.. code-block:: bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
-   docker run -d --name paddle-cpu-doc paddle:<version>
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
 
+  .. code-block:: bash
 
-Then we can direct our Web browser to the HTML version of source code
-at http://localhost:8088/paddle/
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
index dd9923697ab85825557aa89a08870bece7c76673..88c5142ddee994ed0c0dc520195311e97f5a549e 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,12 +6,13 @@
 安装流程
 ++++++++
 
-PaddlePaddle提供Docker镜像来部署环境。
+PaddlePaddle提供pip和Docker的安装方式：
 
 .. toctree::
    :maxdepth: 1
-   
-   docker_install_cn.rst 
+
+   pip_install_cn.rst
+   docker_install_cn.rst
 
 
 编译流程
@@ -19,9 +20,14 @@ PaddlePaddle提供Docker镜像来部署环境。
 
 ..  warning::
 
-    编译流程主要推荐高级用户查看，普通用户请走安装流程。
+    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
 
 ..  toctree::
     :maxdepth: 1
 
-    cmake/build_from_source_cn.rst
+    build_from_source_cn.rst
+
+常见问题解答
+++++++++++++
+
+`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst
index 8a53588e0439df8f4d5fd529b7a20262c67d4e58..c8b60d03578ba6a9b73134ec53b440d057e36079 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -1,22 +1,33 @@
 Install and Build
 =================
 
-Install PaddlePaddle
-----------------------
+.. _install_steps:
 
-..  toctree::
-    :maxdepth: 1
+Install Steps
++++++++++++++
+
+You can choose either pip or Docker to complete your install:
+
+.. toctree::
+   :maxdepth: 1
+
+   pip_install_en.rst
+   docker_install_en.rst
 
-    docker_install_en.rst
 
 Build from Source
 -----------------
 
 ..  warning::
 
-    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    We recommend installing directly via the above installation steps; you'll only need to build PaddlePaddle from source when you need a modified binary.
 
 ..  toctree::
     :maxdepth: 1
 
     build_from_source_en.md
+
+FAQ
+++++++++++
+
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png
new file mode 100644
index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825
Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b26bf4c95cb18f36408eb75894e8b9b674efc67b
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -0,0 +1,86 @@
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+如果需要安装支持GPU的版本，需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+
+如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  如果仍然存在问题，可以执行：
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..113790e4e4ca116e91f11f8a233eae874d9d1b7a
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -0,0 +1,104 @@
+Install Using pip
+================================
+
+You can use the widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method works on
+most current Linux systems and on MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine; it will also download the required dependencies.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+If you wish to install GPU version, just run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+If you wish to install the latest develop branch of PaddlePaddle,
+you can download the latest whl package from our CI system. Open
+the links below, log in as guest, then click on the "Artifact"
+tab to find the download links of the whl packages.
+
+If any of the links below shows a login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) contain not only .py files
+but also binaries built from C++ code. We ensure that the released packages
+run on mainstream systems, such as CentOS 6 or later, Ubuntu 14.04
+or later, and MacOS 10.12 or later.
+
+PaddlePaddle whl packages try to comply with the
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as its build environment. However, CUDA libraries
+usually require CentOS 6 or later, and CentOS 5 is about to reach end of life,
+so we use CentOS 6 as the default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc must include at least the GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "Must include at least the GLIBCXX_3.4.11 and CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "Must include at least the GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  The main cause of this issue is that no package matching your current
+  platform was found. Please check that you are using the Python 2.7 series.
+  Besides, the packages on pypi follow the manylinux1 standard, which needs
+  pip >9.0.0 to install. You can upgrade pip with the command below:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  This prints the package suffixes your system supports; check whether they
+  match the file name of the whl package. You can find the default whl package
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_.
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; if your system supports
+  manylinux1_x86_64 but the (local) whl package is linux_x86_64, you can rename
+  the file to use the manylinux1_x86_64 suffix and then install it.
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index aa418c657a4ba16cce61c030066f4d3e14e891cc..a9087be6f350c5656cabb0c64ba0f200d1c666cc 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,10 +1,61 @@
 新手入门
 ============
 
+.. _quick_install:
+
+快速安装
+++++++++
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本，需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_cn.rst
-  concepts/use_concepts_cn.rst
 
-- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_
+.. _quick_start:
+
+快速开始
+++++++++
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index be3253e3d41b99a2b696e2c5ef6463ed49680d69..d14e3f5c0cc90792fce9cb82e65da482c44dc433 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,9 +1,61 @@
 GET STARTED
 ============
 
+.. _quick_install:
+
+Quick Install
+----------------------
+
+You can use pip to install PaddlePaddle with a single command. It supports
+CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version, run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build:
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_en.rst
 
-- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
+
+.. _quick_start:
+
+Quick Start
+----------------------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 76d3e0a0092f89005605a23e14e712530112a5ac..eb95356c67c5df22e4f543f958eb31d79f2c6195 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,7 +19,6 @@
 ..  toctree::
   :maxdepth: 1
 
-  dev/build_cn.rst
   dev/write_docs_cn.rst
 
 模型配置
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1b6034be4edffd2cbc822018b733b9a3836ea84a..1fbfcd260b912078f00ed5b720ed607db725c4e2 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,7 +18,6 @@ Development
 ..  toctree::
   :maxdepth: 1
 
-  dev/build_en.rst
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
 
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
index 32d89a7c183d57e0e69039dfb2c78703d9866f7c..b3330b0b59d65d81d565d553349c39945ef82e42 100644
--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -71,7 +71,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 
 ```
 
-可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
 
 ```text
 Called By:
@@ -121,7 +121,7 @@ python -m yep -v main.py
 
 1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
 2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
-3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
 
 ### 查看性能分析文件
 
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d5b55e1c95f248f551e6a0a3b39123169dd7784f..30f3a766f0c65187c8f2dd4603e3d26c9b9a6a3d 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
 }
 
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value) {
+                                            paddle_real* value) {
   if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
@@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
 }
 
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result) {
+                                            paddle_real* result) {
   if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index 01b8bad2ee9f528f8622346f43b9ff82225a7e73..8cc3e0034e058daefc63c69efe0b1f575c586897 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
  * @note  value should contain enough element of data to init the mat
  */
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value);
+                                            paddle_real* value);
 
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
@@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           paddle_real** rawRowBuffer);
 
 /**
- * @brief copy data from the matrix 
+ * @brief copy data from the matrix
  * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data 
+ * @param [out] result pointer to store the matrix data
  * @return paddle_error
  * @note the space of the result should allocated before invoke this API
  */
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result);
+                                            paddle_real* result);
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index a0f2906c749054c1ff9f624e47df432ec2bd6ac8..fdf6de4babff3bb3c253aaf516636882237e6faf 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,8 @@
    limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
 
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
@@ -27,11 +29,11 @@
 namespace paddle {
 namespace framework {
 
-std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
   os << "{";
-  for (auto& v : lod) {
+  for (auto &v : lod) {
     os << "{";
-    for (auto& i : v) {
+    for (auto &i : v) {
       os << i << ",";
     }
     os << "}";
@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
   return os;
 }
 
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
+LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
   LoD new_lod;
   new_lod.reserve(level_end - level_begin);
   for (size_t i = level_begin; i < level_end; i++) {
@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   return new_lod;
 }
 
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
   PADDLE_ENFORCE_LT(elem_end, in[level].size());
@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
   res[0].assign(in[level].begin() + elem_begin,
                 in[level].begin() + elem_end + 1);
   for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto& in_level = in[level + lvl];
-    const auto& above_level = res[lvl - 1];
-    auto& out_level = res[lvl];
+    const auto &in_level = in[level + lvl];
+    const auto &above_level = res[lvl - 1];
+    auto &out_level = res[lvl];
     out_level.assign(in_level.begin() + above_level.front(),
                      in_level.begin() + above_level.back() + 1);
   }
@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
     // to make the first offset equals 0, all the elements minus the first
     // element
     size_t front = res[lvl].front();
-    for (auto& ele : res[lvl]) {
+    for (auto &ele : res[lvl]) {
       ele -= front;
     }
   }
   return res;
 }
 
-LoD ToAbsOffset(const LoD& in) {
+LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
   if (in.empty() || in.size() == 1) return in;
   LoD result = in;
   for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto& ele : result[level]) {
+    for (auto &ele : result[level]) {
       ele = result[level + 1][ele];
     }
   }
   return result;
 }
 
-bool operator==(const LoD& a, const LoD& b) {
+bool operator==(const LoD &a, const LoD &b) {
   if (a.size() != b.size()) {
     return false;
   }
 
   for (size_t i = 0; i < a.size(); i++) {
-    const auto& a_level = a[i];
-    const auto& b_level = b[i];
+    const auto &a_level = a[i];
+    const auto &b_level = b[i];
     if (a_level.size() != b_level.size()) {
       return false;
     }
@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
 }
 
 using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
                                         size_t end_idx, size_t start_level) {
   LoD sub_lod;
 
@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
   return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
 
-void AppendLoD(LoD* lod, const LoD& lod_length) {
+void AppendLoD(LoD *lod, const LoD &lod_length) {
   PADDLE_ENFORCE(
       lod->empty() || lod->size() == lod_length.size(),
       "The lod_length should has the same size with the appended lod.");
@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
     *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
   }
   for (size_t i = 0; i < lod->size(); ++i) {
-    auto& level = (*lod)[i];
+    auto &level = (*lod)[i];
     for (size_t len : lod_length[i]) {
       level.push_back(level.back() + len);
     }
   }
 }
 
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+                       const platform::DeviceContext &dev_ctx) {
+  // Stream layout: version, tensor description, tensor data, LoD information.
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    framework::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto *pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto *data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto &gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::GPUPlace>(tensor.place()),
+                     reinterpret_cast<const void *>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char *>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  {  // the 4th field, lod information
+     // uint64_t lod_level
+     // uint64_t lod_level_1 size in byte.
+     // int*     lod_level_1 data
+     // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+    for (auto &each : lod) {
+      size = each.size() * sizeof(framework::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+  uint32_t version;
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  framework::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char *>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void *buf;
+    platform::Place cpu = platform::CPUPlace();
+    switch (desc.data_type()) {
+      case framework::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case framework::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case framework::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case framework::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    is.read(static_cast<char *>(buf), tensor->memory_size());
+  }
+  {  // read lod
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size;
+      is.read(reinterpret_cast<char *>(&size), sizeof(size));
+      std::vector<size_t> tmp(size / sizeof(size_t));
+      is.read(reinterpret_cast<char *>(tmp.data()),
+              static_cast<std::streamsize>(size));
+      lod[i] = tmp;
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 21bdfca1111f16d5b8ea71be004ddb8da12fd03c..9411c96aea4c10ebf921cc3e3b442769c8acbefa 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
 
 void AppendLoD(LoD* lod, const LoD& lod_length);
 
+/*
+ * Serialize/Deserialize LoDTensor to/from a std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 48cd131550dea5ad3f368b25c31d753efbe0dff9..02a825324328fa5cfd3a4d23a8c64488cc88aeec 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
                       "The %d-th output of Output(%s) must be LoDTensor.", j,
                       out);
-    in_var->SetLoDLevel(out_var->GetLodLevel());
+    out_var->SetLoDLevel(in_var->GetLodLevel());
   }
   bool IsRuntime() const override;
 
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index 8ee2e15a59113e6d17513045e6baa58f8da9026e..4e34b90d57eed8fea84b83045df61a98483c8849 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
   auto dst_ptr = static_cast<void*>(dst->data());
 
   if (platform::is_cpu_place(src.place())) {
-    memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()), 
-		    src_ptr, size);
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
   }
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
-        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), src_ptr,
-        size,
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
 #endif
-
 }
 
 }  // namespace framework
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be26b9ba88c279036f73b0a0baaff164755fe067
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inputV = getInputValue(0);
+
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..df20a49934d5dd444f127842c8fdb7c77f4ebeb1
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interactions with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found in this paper:
+ *
+ *     Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  // Store the square values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // Row vector filled with -1, shape: (1, input height)
+  MatrixPtr negOnes_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e5b347aa44ce1951c043d5813a5b3ff..236f8096bdb6e024cf3c9c73eba422616a777a23 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
-
   preOutput_.value->zeroMem();
 
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
     preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
   }
   // keep consistent with the clipping in the following softrelu
   preOutput_.value->clip(-40.0, 40.0);
   preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
-                                 *output_.value,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
   IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
   preOutput_.grad->one();
   preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
 
   if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *label, *biases_->getWGrad());
-
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the W-gradient for the current layer */
     MatrixPtr input = getInputValue(i);
     if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
       preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the input layers error */
     MatrixPtr inputGrad = getInputGrad(i);
     if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
       preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
     }
   }
 }
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b1674680da962d6e51caa56b46279b70de..7f896e61ca26e3e22b99b65b1285384a121f7f02 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
   int codeLength_;
   /// temporary result of output_
   Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index c295ea19c9ccb3d05c509a41925d2c36efdba8ef..24e6cae8e69557c42ed5d437edce101709ca3983 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
 endif()
 
 if(NOT MOBILE_INFERENCE)
-################## test_Evaluator #######################
+    ################## test_Evaluator #######################
     add_unittest(test_Evaluator
         test_Evaluator.cpp)
       
-############### test_RecurrentGradientMachine ###############
+    ############### test_RecurrentGradientMachine ###############
     # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
     # I will fix it.
     add_unittest_without_exec(test_RecurrentGradientMachine
@@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE)
                 ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
       
-############### test_NetworkCompare ###############
+    ############### test_NetworkCompare ###############
     add_unittest_without_exec(test_NetworkCompare
         test_NetworkCompare.cpp)
     if(WITH_GPU)
@@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE)
             COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
             WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
     endif()
-endif()
 
+    ################# test_CompareSparse ##################
+    add_unittest_without_exec(test_CompareSparse
+        test_CompareSparse.cpp)
+    if(NOT ON_TRAVIS)
+      add_test(NAME test_CompareSparse
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+              ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+                  ./.set_port.sh -p port -n 6
+                      ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+    endif()
+
+    ################ test_CompareTwoNets ######################
+    add_unittest_without_exec(test_CompareTwoNets
+        test_CompareTwoNets.cpp)
+    add_test(NAME test_CompareTwoNets
+      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+            ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+            ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
 
+################ test_PyDataProvider2 ######################
 add_unittest_without_exec(test_PyDataProvider2
         test_PyDataProvider2.cpp)
-
 add_test(NAME test_PyDataProvider2
    COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 )
-
-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-          ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-              ./.set_port.sh -p port -n 6
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
-
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index cacf10692942f5eca2f6c498183f4acc00768460..c5359f272b4bed4d4d2483bf19d7ae482b0d33dd 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
-  // Not support GPU now
-  testLayerGrad(config,
-                "hsigmoid",
-                100,
-                /* trans */ false, /* useGpu */
-                false);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
+  }
 }
 
 TEST(Layer, multi_cross) {
@@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) {
   }
 }
 
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03bb1d6e2bdf86d8c56a8157938fb832..dc6979cf5a5229fb09866189f28217889d58c2d0 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
   os << ";";
 }
 
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
+
 void CpuSparseMatrix::randomizeUniform() {
   CHECK_LE(elementCnt_, height_ * width_);
   if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index aad1348353d558abca72ed0fa5cf943237e3ac78..522b436a2a69179d3f4f17c919d5ba024102db7b 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -239,6 +239,15 @@ public:
               const unsigned int* cols,
               const real* values);
 
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
   void randomizeUniform();
 
   void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a4c4374cf2f8b4b034d05e3a4c2221300a944214..937441b318095eadb9022c1d7578ad8aca2dadc8 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -191,6 +191,7 @@ set(DEPS_OPS
     sum_op
     pool_op
     maxout_op
+    unpool_op
     pool_with_index_op
     conv_op
     conv_transpose_op
@@ -205,8 +206,24 @@ set(DEPS_OPS
     tensor_array_read_write_op
     gru_op
     adagrad_op
-    sgd_op)
+    sgd_op
+    save_op
+    load_op
+    send_op
+    recv_op)
 
+add_subdirectory(detail)
+op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    send_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    recv_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
@@ -219,6 +236,7 @@ op_library(adagrad_op DEPS selected_rows_functor)
 op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
+op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
@@ -235,6 +253,10 @@ op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
 
+# FIXME(typhoonzero): save/load depend on lodtensor serialization functions
+op_library(save_op DEPS lod_tensor)
+op_library(load_op DEPS lod_tensor)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
@@ -242,6 +264,8 @@ endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
+
+
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
@@ -251,3 +275,4 @@ if(WITH_GPU)
   cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index f884e6efa917ce3f8554dce0e248f2b29273e3f3..ac97bd83ab7e7838871586cfe5acb832084b6cec 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
     const auto x_dims = ctx->GetInputDim("X");
     const TensorFormat tensor_format =
         StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "Input X must have 2 to 5 dimensions.");
+
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                              : x_dims[x_dims.size() - 1]);
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input X must have 3 to 5 dimensions.");
-
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
index 726d1ea1b8d7ced93f94bb0e5bb4df9e43b0ac7b..7b2f3187007fa2491afa75de1cde1910c6ce9bb8 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims,
                   const TensorFormat &tensor_format, int *N, int *C, int *H,
                   int *W, int *D) {
   *N = dims[0];
-  *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
-  *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
-  *W = dims.size() > 3
-           ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
-           : 1;
-  *D = dims.size() > 4
-           ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
-           : 1;
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
+    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
+    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+    *W = dims.size() > 3
+             ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+             : 1;
+    *D = dims.size() > 4
+             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+             : 1;
+  }
 }
 
 template <typename T>
@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
 
     const auto &x_dims = x->dims();
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index a9763d424801cfced5fe4c4718a335a24b81cfdc..3f97dc7ee0a61944a8a57314b5ec7f33df619bf3 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 7a36a9b21aa6a1b415ac5a232e65eda8051c87f8..462e6d9cbcbe61d9911efe8beff4446620e1e932 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
       .SetDefault({0, 0});
   AddAttr<int>(
       "groups",
-      "(int default:1), the group size of convolution operator. "
+      "(int default:1), the groups number of the convolution operator. "
       "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
       "when group=2, the first half of the filters is only connected to the "
       "first half of the input channels, while the second half of the filters "
@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
 Convolution Operator.
 
 The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups, dilations parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+Input(Input) and Output(Output) are in NCHW format. Where N is batch
 size, C is the number of channels, H is the height of the feature, and W is
-the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements.
-These two elements represent height and width, respectively.
+the width of the feature.
+Filters(Input) is MCHW format. Where M is the number of output image channels, C is
+the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+Parameters(strides, paddings, dilations) are two elements. These two elements represent
+height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       Input shape: (N, C_in, H_in, W_in)
-       Filter shape: (C_out, C_in, H_f, W_f)
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, H_out, W_out)
-  where
-       H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1;
-       W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1;
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+$$
+       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
 )DOC");
 }
 
@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
       .SetDefault({0, 0, 0});
   AddAttr<int>(
       "groups",
-      "(int default:1), the group size of convolution operator. "
+      "(int default:1), the groups number of the convolution operator. "
       "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
       "when group=2, the first half of the filters is only connected to the "
       "first half of the input channels, while the second half of the filters "
@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
   AddAttr<std::vector<int>>("dilations",
                             "(vector<int> default:{1, 1, 1}), the "
                             "dilations(d_dilation, h_dilation, w_dilation) of "
-                            "convolution operator. Currently, conv3d doesn't "
-                            "support dilation.")
+                            "convolution operator.")
       .SetDefault({1, 1, 1});
 
   AddComment(R"DOC(
 Convolution3D Operator.
 
 The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+Input(Input) and output(Output) are in NCDHW format, where N is batch
 size, C is the number of channels,D is the depth of the feature, H is the height of
-the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
-are three elements. These three elements represent depth, height and width, respectively.
+the feature, and W is the width of the feature.
+Filter(Input) is in MCDHW format, where M is the number of output image channels,
+C is the number of input image channels, D is the depth of the filter,
+H is the height of the filter, and W is the width of the filter.
+Parameters(strides, paddings, dilations) are three elements. These three elements
+represent depth, height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       Input shape: (N, C_in, D_in, H_in, W_in)
-       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, D_out, H_out, W_out)
-  where
-       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
+       H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
+  $$
 )DOC");
 }
 
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index 3e55ef036a7fb976117054574d1347fa943acd55..678b192dea78fc6b4a6b54c4bb09a55dfb8f9c38 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                  "ConvTransposeOp input dimension and strides dimension should "
                  "be consistent.");
   PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
-                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "ConvTransposeOp paddings dimension and strides "
                     "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
                     "In ConvTransposeOp, The input channel should be the same "
@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
       "The format of input tensor is NCHW. Where N is batch size, C is the "
       "number of input channels, H is the height of the feature, and "
       "W is the width of the feature.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator. "
-           "The format of the filter tensor is CMHW, where C is the number of "
-           "output image channels, M is the number of input image channels, "
-           "H is the height of the filter, and W is the width of the filter. "
-           "We enforce groups number == 1 and padding == 0 in "
-           "the convolution transpose scenario.");
+  AddInput(
+      "Filter",
+      "(Tensor) The filter tensor of convolution transpose operator. "
+      "The format of the filter tensor is MCHW, where M is the number of "
+      "input feature channels, C is the number of "
+      "output feature channels, "
+      "H is the height of the filter, and W is the width of the filter. "
+      "We enforce groups number == 1 in the convolution transpose scenario.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>(
       "strides",
-      "(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of "
+      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
       "convolution transpose operator.")
       .SetDefault({1, 1});
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
       "transpose operator.")
       .SetDefault({0, 0});
   AddComment(R"DOC(
@@ -88,21 +89,26 @@ Convolution2D Transpose Operator.
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
-size, C is the number of channels, H is the height of the feature, and 
-W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
-These two elements represent height and width, respectively.
+Input(Input) and output(Output) are in NCHW format. Where N is batch size, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
+Filter(Input) is in MCHW format. Where M is the number of input feature channels,
+C is the number of output feature channels, H is the height of the filter,
+and W is the width of the filter.
+Parameters(strides, paddings) are two elements. These two elements represent height
+and width, respectively.
 The input(X) size and output(Out) size may be different.
+
 Example:
   Input:
-       Input shape: (N, C_in, H_in, W_in)
-       Filter shape: (C_in, C_out, H_f, W_f)
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, H_out, W_out)
-  where
-       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
-       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
+  $$
 )DOC");
 }
 
@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
            "W is the width of the feature.");
   AddInput("Filter",
            "(Tensor) The filter tensor of convolution transpose operator."
-           "The format of the filter tensor is CMDHW, where C is the number of "
-           "output image channels, M is the number of input image channels, D "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "input feature channels, C is the number of "
+           "output feature channels, D "
            "is the depth of the filter, H is the height of the filter, and "
            "W is the width of the filter."
            "We enforce groups number == 1 and padding == 0 in "
@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
             "the number of channels, D is the depth of the feature, H is the "
             "height of the feature, and W is the width of the feature.");
   AddAttr<std::vector<int>>("strides",
-                            "(vector<int> defalut:{1, 1, 1}), the "
+                            "(vector<int> default:{1, 1, 1}), the "
                             "strides{d_stride, h_stride, w_stride} of "
                             "convolution transpose operator.")
       .SetDefault({1, 1, 1});
   AddAttr<std::vector<int>>("paddings",
-                            "(vector<int> defalut:{0, 0, 0}), paddings(d_pad, "
+                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                             "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
   AddComment(R"DOC(
@@ -144,23 +151,28 @@ Convolution3D Transpose Operator.
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
-size, C is the number of channels, D is the depth of the feature, 
-H is the height of the feature, and W is the width of the feature. 
-Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
+Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
+number of channels, D is the depth of the feature, H is the height of the feature,
+and W is the width of the feature.
+Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
+C is the number of output feature channels, D is the depth of the filter, H is the
+height of the filter, and W is the width of the filter.
+Parameters(strides, paddings) are three elements. These three elements represent
+depth, height and width, respectively.
 The input(X) size and output(Out) size may be different.
-Example:
+
+Example:
   Input:
-       Input shape: (N, C_in, D_in, H_in, W_in)
-       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, D_out, H_out, W_out)
-  where
-       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
-       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
-       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
+       H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
+  $$
 )DOC");
 }
 
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 0fc0735788c499c2d520c0cc689e1ce07ba67ce8..1cacb770e6af3ad3c99ab81c5598ffcd228f59b2 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    // TODO(Zhuoyuan): Paddings can be added in future.
     // groups will alway be disabled in conv2dtranspose.
 
     const int batch_size = static_cast<int>(input->dims()[0]);
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f6bdc63cc2cfae526fe911ee4d989675452d5c5d
--- /dev/null
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -0,0 +1 @@
+grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89dc5045221156eed7aa9411bc96ad86f91136d2
--- /dev/null
+++ b/paddle/operators/detail/recv_impl.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+// Handles one SendVariable RPC: queues the received tensor, waits for a result
+// on the return queue, and replies with it under the request's variable name.
+Status SendRecvServerImpl::SendVariable(ServerContext *context,
+                                        const VariableMessage *in_var,
+                                        VariableMessage *out_var) {
+  // TODO(typhoonzero): deserialize in_tensor and run pserver network.
+  std::istringstream request_stream(in_var->serialized());
+  framework::LoDTensor tensor;
+  framework::DeserializeFromStream(request_stream, &tensor);
+  lodtensor_queue_.Push(std::move(tensor));
+  // Block until the sub graph is done.
+  tensor = lodtensor_return_queue_.Pop();
+  std::ostringstream reply_stream;
+  // FIXME(typhoonzero): get context from op.
+  framework::SerializeToStream(reply_stream, tensor,
+                               platform::CPUDeviceContext());
+  out_var->set_varname(in_var->varname());
+  out_var->set_serialized(reply_stream.str());
+  return Status::OK;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da1ddf75d2afb85670c5ea0c9884376415f28208
--- /dev/null
+++ b/paddle/operators/detail/send_impl.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+// Sends variable `inname` from `scope` to the server and copies the returned
+// tensor into variable `outname`. Returns false when the RPC status is not OK.
+bool RPCClient::SendVariable(const framework::Scope& scope,
+                             const std::string& inname,
+                             const std::string& outname) {
+  ClientContext context;
+  VariableMessage msg, out_msg;
+  // FIXME(typhoonzero): pass device context to here.
+  auto ctx = platform::CPUDeviceContext();
+  auto* var = scope.FindVar(inname);
+  PADDLE_ENFORCE(var, "Input variable %s is not found in scope", inname);
+  // TODO(typhoonzero): support SelectedRows
+  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                 "Only support LoDTensor, %s has wrong type", inname);
+  std::ostringstream oss;
+  framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
+  msg.set_varname(inname);
+  msg.set_serialized(oss.str());
+  if (!stub_->SendVariable(&context, msg, &out_msg).ok()) return false;
+  std::istringstream iss(out_msg.serialized());
+  framework::LoDTensor ret_tensor;
+  framework::DeserializeFromStream(iss, &ret_tensor);
+  auto* outvar = scope.FindVar(outname);
+  // FindVar may return null (as checked for `inname`); fail with a message.
+  PADDLE_ENFORCE(outvar, "Output variable %s is not found in scope", outname);
+  auto* out_tensor = outvar->GetMutable<framework::LoDTensor>();
+  // FIXME(typhoonzero): do not copy.
+  framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
+  return true;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
new file mode 100644
index 0000000000000000000000000000000000000000..07ff9d2c621a2dfb51792821a0d3fc398c315835
--- /dev/null
+++ b/paddle/operators/detail/send_recv.proto
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+syntax = "proto3";
+
+package sendrecv;
+
+service SendRecvService {
+  // For parameter-server round-robin-like hashing: tensors are not split.
+  // Each call sends one variable and receives one variable in reply.
+  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+}
+
+// VariableMessage carries one serialized Paddle variable.
+// The payload can be one of:
+//   Tensor
+//   LoDTensor
+//   SelectedRows
+message VariableMessage {
+  string varname = 1;    // name of the variable
+  bytes serialized = 2;  // serialized content of the variable
+}
+
+message VoidMessage {}  // empty message with no fields
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9a5340a8636db7b5d6ec7b21368632d3916b4aa
--- /dev/null
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+// #include <grpc++/channel.h>
+// #include <grpc++/client_context.h>
+// #include <grpc++/create_channel.h>
+// #include <grpc++/security/credentials.h>
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+
+using grpc::Channel;
+using grpc::Server;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerBuilder;
+
+using grpc::ClientContext;
+using grpc::ClientReader;
+using grpc::ClientReaderWriter;
+using grpc::ClientWriter;
+using grpc::Status;
+using sendrecv::SendRecvService;
+using sendrecv::VariableMessage;
+using sendrecv::VoidMessage;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class SendRecvServerImpl final : public SendRecvService::Service {
+ public:
+  explicit SendRecvServerImpl() {}
+  // Handler for the SendVariable rpc of SendRecvService.
+  Status SendVariable(ServerContext *context, const VariableMessage *in_var,
+                      VariableMessage *out_var) override;
+  // Takes the next received tensor; blocks while the receive queue is empty.
+  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+  // Queues `tensor` as the reply that a waiting SendVariable call returns.
+  void Push(const framework::LoDTensor &tensor) {
+    this->lodtensor_return_queue_.Push(tensor);
+  }
+
+ private:
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+};
+
+// RPCClient sends variables to a pserver sub-network through a
+// SendRecvService stub created on the given channel.
+class RPCClient {
+ public:
+  RPCClient(std::shared_ptr<Channel> channel)
+      : stub_(SendRecvService::NewStub(channel)) {}
+  // Sends `inname` and stores the reply in `outname`; false if the RPC fails.
+  bool SendVariable(const framework::Scope &scope, const std::string &inname,
+                    const std::string &outname);
+
+ private:
+  std::unique_ptr<SendRecvService::Stub> stub_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..44899217579532af2c1d2e6074ec0e08231e7b86
--- /dev/null
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+// Unbounded FIFO queue shared between threads; Pop() blocks while it is empty.
+template <typename T>
+class SimpleBlockQueue {
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  std::deque<T> queue_;
+ public:
+  // Enqueues a copy of `value`, then wakes one thread blocked in Pop().
+  void Push(T const& value) {
+    std::unique_lock<std::mutex> guard(mutex_);
+    queue_.push_front(value);
+    guard.unlock();  // release the mutex before waking a waiter
+    condition_.notify_one();
+  }
+  // Waits until an element is available and returns the oldest one.
+  T Pop() {
+    std::unique_lock<std::mutex> guard(mutex_);
+    while (queue_.empty()) condition_.wait(guard);
+    T oldest(std::move(queue_.back()));
+    queue_.pop_back();
+    return oldest;
+  }
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index 1b18368e0e16365682520b62a7f6adab0cbb527f..564489d3a98b59e3e527be5613a73d23d6dbbf31 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel<T> {
 
     int frame_size = hidden_dims[1];
     math::hl_gru_value<T> gru_value;
-    gru_value.gateWeight = const_cast<T*>(weight_data);
-    gru_value.stateWeight =
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     Tensor ordered_h0;
     const size_t* order = batch_gate->lod()[2].data();
@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel<T> {
       // to reorder.
       ReorderInitState<Place, T>(context.device_context(), *h0, order,
                                  &ordered_h0, true);
-      gru_value.prevOutValue = ordered_h0.data<T>();
+      gru_value.prev_out_value = ordered_h0.data<T>();
     } else {
-      gru_value.prevOutValue = nullptr;
+      gru_value.prev_out_value = nullptr;
     }
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> {
       Tensor gate_t = batch_gate->Slice(bstart, bend);
       Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
       Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      gru_value.outputValue = hidden_t.data<T>();
-      gru_value.gateValue = gate_t.data<T>();
-      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      gru_value.output_value = hidden_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
       math::GRUUnitFunctor<Place, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size,
           math::ActiveType(context.Attr<std::string>("activation")),
           math::ActiveType(context.Attr<std::string>("gate_activation")));
-      gru_value.prevOutValue = gru_value.outputValue;
+      gru_value.prev_out_value = gru_value.output_value;
     }
 
     math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel<T> {
     to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
 
     math::hl_gru_value<T> gru_value;
-    gru_value.gateWeight = const_cast<T*>(weight_data);
-    gru_value.stateWeight =
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
 
     math::hl_gru_grad<T> gru_grad;
     if (weight_grad) {
-      gru_grad.gateWeightGrad =
+      gru_grad.gate_weight_grad =
           weight_grad->mutable_data<T>(context.GetPlace());
       zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-      gru_grad.stateWeightGrad =
+      gru_grad.state_weight_grad =
           weight_grad->data<T>() + 2 * frame_size * frame_size;
     } else {
-      gru_grad.gateWeightGrad = nullptr;
-      gru_grad.stateWeightGrad = nullptr;
+      gru_grad.gate_weight_grad = nullptr;
+      gru_grad.state_weight_grad = nullptr;
     }
 
     auto batch_starts = batch_hidden_grad.lod()[0];
@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> {
       int cur_batch_size = bend - bstart;
 
       Tensor gate_t = batch_gate->Slice(bstart, bend);
-      gru_value.gateValue = gate_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
       Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
 
       Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
-      gru_grad.outputGrad = hidden_grad_t.data<T>();
+      gru_grad.output_grad = hidden_grad_t.data<T>();
       Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
-      gru_grad.gateGrad = gate_grad_t.data<T>();
+      gru_grad.gate_grad = gate_grad_t.data<T>();
       Tensor reset_hidden_prev_grad_t =
           batch_reset_hidden_prev_grad.Slice(bstart, bend);
-      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
+      gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
       if (n == 0) {
-        gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr;
-        gru_grad.prevOutGrad =
+        gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
+        gru_grad.prev_out_grad =
             h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
       } else {
         int bstart_pre = static_cast<int>(batch_starts[n - 1]);
         Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
-        gru_value.prevOutValue = hidden_prev_t.data<T>();
+        gru_value.prev_out_value = hidden_prev_t.data<T>();
         Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
-        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
+        gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
       }
 
       math::GRUUnitGradFunctor<Place, T>::compute(
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index b0838eed1611c1d51e57fc2300606f753982dc89..4e58b84430f2a8697bbbc1acf971fd063120f563 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
                    out_var_name);
 
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-    uint32_t version;
-    fin.read(reinterpret_cast<char *>(&version), sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-    framework::TensorDesc desc;
-    {  // int32_t size
-       // proto buffer
-      int32_t size;
-      fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-      std::unique_ptr<char[]> buf(new char[size]);
-      fin.read(reinterpret_cast<char *>(buf.get()), size);
-      PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                     "Cannot parse tensor desc");
-    }
-    {  // read tensor
-      std::vector<int64_t> dims;
-      dims.reserve(static_cast<size_t>(desc.dims().size()));
-      std::copy(desc.dims().begin(), desc.dims().end(),
-                std::back_inserter(dims));
-      tensor->Resize(framework::make_ddim(dims));
-
-      void *buf;
-      platform::Place cpu = platform::CPUPlace();
-      switch (desc.data_type()) {
-        case framework::FP32:
-          buf = tensor->mutable_data<float>(cpu);
-          break;
-        case framework::FP64:
-          buf = tensor->mutable_data<double>(cpu);
-          break;
-        case framework::INT32:
-          buf = tensor->mutable_data<int>(cpu);
-          break;
-        case framework::INT64:
-          buf = tensor->mutable_data<int64_t>(cpu);
-          break;
-        default:
-          PADDLE_THROW("DataType %d not supported", desc.data_type());
-      }
-      fin.read(static_cast<char *>(buf), tensor->memory_size());
-    }
-    {  // read lod
-      uint64_t lod_level;
-      fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-      auto &lod = *tensor->mutable_lod();
-      lod.resize(lod_level);
-      for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size;
-        fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-        std::vector<size_t> tmp(size / sizeof(size_t));
-        fin.read(reinterpret_cast<char *>(tmp.data()),
-                 static_cast<std::streamsize>(size));
-        lod[i] = tmp;
-      }
-    }
+    framework::DeserializeFromStream(fin, tensor);
 
     auto place = dev_ctx.GetPlace();
     if (platform::is_gpu_place(place)) {
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..257e5c8a49e935dcbdc33e5060118ef1804fa8d7
--- /dev/null
+++ b/paddle/operators/log_loss_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks Predicted/Labels and sets Loss to shape [pred_dims[0], 1].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    // Both inputs must have the same shape, and that shape is [batch_size, 1].
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(Predicted) must be 1.");
+    // Loss takes its LoD from Predicted.
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+
+template <typename AttrType>  // AttrType: C++ type of the "epsilon" attribute.
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op. "
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op. "
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");  // no default set
+    AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. Log Loss quantifies
+the accuracy of a classifier by penalising false classifications. Minimising the
+Log Loss is equivalent to maximising the accuracy of the classifier. We define
+Predicted as the values predicted by our model and Labels as the target ground
+truth value. Log loss can evaluate how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
+The equation is:
+
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+        (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+  }
+};
+
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates grad-op variables; Predicted@GRAD takes Predicted's shape.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    // The grad kernel combines all three element-wise, so shapes must agree.
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+    ctx->SetOutputDim(framework::GradVarName("Predicted"), pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Register log_loss with its grad op, plus float CPU kernels for both.
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6c189ef3412d7a56205502c7913e93218a03b929
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+// Register the float GPU kernels for log_loss and log_loss_grad.
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..73404fce9157fa750a51451fa93646bc4059481a
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Local shorthands for framework::Tensor and framework::EigenVector.
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+// Forward kernel: fills Loss with the element-wise log loss of its inputs.
+template <typename Place, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Obtain writable storage for Loss before creating the Eigen view of it.
+    auto* out_tensor = ctx.Output<Tensor>("Loss");
+    out_tensor->mutable_data<T>(ctx.GetPlace());
+    const T one = static_cast<T>(1);
+    const T eps = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto pred = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto target = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+    auto out = EigenVector<T>::Flatten(*out_tensor);
+    auto device = ctx.GetEigenDevice<Place>();
+
+    // out = -target * log(pred + eps) - (1 - target) * log(1 - pred + eps)
+    out.device(device) = -(target * (pred + eps).log()) -
+                         ((one - target) * (one - pred + eps).log());
+  }
+};
+
+// Backward kernel: Predicted@GRAD = Loss@GRAD * d(Loss)/d(Predicted).
+template <typename Place, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const T one = static_cast<T>(1);
+    const T eps = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto pred = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto target = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* pred_grad = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+    auto upstream = EigenVector<T>::Flatten(*loss_grad);
+    auto device = ctx.GetEigenDevice<Place>();
+
+    // Nothing to write when the Predicted@GRAD output is absent.
+    if (!pred_grad) return;
+    pred_grad->mutable_data<T>(ctx.GetPlace());
+    auto grad = framework::EigenVector<T>::Flatten(*pred_grad);
+    grad.device(device) = upstream * (-(target / (pred + eps)) +
+                                      ((one - target) / (one - pred + eps)));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 3017f133afc5d4dcd484c78b44591a876ab4d667..bf47879f772a3013bd7ce78c6f8a6aefe65298f9 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,8 +13,9 @@ if(WITH_GPU)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
     nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -26,8 +27,9 @@ else()
     cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
-    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
     cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
index 51af140cf4d5e6581765bea00033fa53d383230d..4c67dec9cbeb48f400f79f5ed7ba3c939fa2540c 100644
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -25,393 +25,397 @@ namespace detail {
 #ifndef __NVCC__
 
 template <class OpResetOutput, typename T>
-void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
-                                       T *gateValue, T *resetOutputValue,
-                                       T *prevOutputValue, int frameSize,
+void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                       T *gate_value, T *reset_output_value,
+                                       T *prev_output_value, int frame_size,
                                        activation_mode_t active_gate) {
-  T rValueUpdateGate;
-  T rValueResetGate;
-  T rValueResetOutput;
-  T rPrevOut = 0;
-  T *updateGate = gateValue;
-  T *resetGate = gateValue + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
+  T r_value_update_gate;
+  T r_value_reset_gate;
+  T r_value_reset_output;
+  T r_prev_out = 0;
+  T *update_gate = gate_value;
+  T *reset_gate = gate_value + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
     }
 
-    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
-                  rValueResetOutput, active_gate);
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
 
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    resetOutputValue[i] = rValueResetOutput;
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    reset_output_value[i] = r_value_reset_output;
   }
 }
 
 template <class OpFinalOutput, typename T>
-void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
-                                       T *gateValue, T *prevOutputValue,
-                                       T *outputValue, int frameSize,
+void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
+                                       T *gate_value, T *prev_output_value,
+                                       T *output_value, int frame_size,
                                        activation_mode_t active_node) {
-  T rValueUpdateGate;
-  T rValueFrameState;
-  T rPrevOut = 0;
-  T rOutput;
-  T *updateGate = gateValue;
-  T *frameState = gateValue + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
+  T r_value_update_gate;
+  T r_value_frame_state;
+  T r_prev_out = 0;
+  T r_output;
+  T *update_gate = gate_value;
+  T *frame_state = gate_value + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
     }
 
-    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
-                  active_node);
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
 
-    frameState[i] = rValueFrameState;
-    outputValue[i] = rOutput;
+    frame_state[i] = r_value_frame_state;
+    output_value[i] = r_output;
   }
 }
 
 template <class OpResetOutput, typename T>
-void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
-                                     T *resetOutputValue, T *prevOutputValue,
-                                     int frameSize,
+void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                     T *gate_value, T *reset_output_value,
+                                     T *prev_output_value, int frame_size,
                                      activation_mode_t active_gate) {
 #ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueResetGate;
-  __m256 rValueResetOutput;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 *updateGate = (__m256 *)gateValue;
-  __m256 *resetGate = (__m256 *)(gateValue + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256 *)prevOutputValue)[i];
+  __m256 r_value_update_gate;
+  __m256 r_value_reset_gate;
+  __m256 r_value_reset_output;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
     }
 
-    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
-                  rValueResetOutput, active_gate);
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
 
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
   }
 #endif
 }
 
 template <class OpFinalOutput, typename T>
-void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
-                                     T *prevOutputValue, T *outputValue,
-                                     int frameSize,
+void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
+                                     T *gate_value, T *prev_output_value,
+                                     T *output_value, int frame_size,
                                      activation_mode_t active_node) {
 #ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueFrameState;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 rOutput;
-  __m256 *updateGate = (__m256 *)gateValue;
-  __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256 *)prevOutputValue)[i];
+  __m256 r_value_update_gate;
+  __m256 r_value_frame_state;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 r_output;
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
     }
 
-    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
-                  active_node);
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
 
-    frameState[i] = rValueFrameState;
-    ((__m256 *)outputValue)[i] = rOutput;
+    frame_state[i] = r_value_frame_state;
+    ((__m256 *)output_value)[i] = r_output;
   }
 #endif
 }
 
 template <class OpResetOutput, typename T>
-inline void forward_reset_output(OpResetOutput opResetOutput,
-                                 hl_gru_value<T> value, int frameSize,
-                                 int batchSize, activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void forward_reset_output(OpResetOutput op_reset_output,
+                                 hl_gru_value<T> value, int frame_size,
+                                 int batch_size,
+                                 activation_mode_t active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_forward_reset_output(
-          opResetOutput, value.gateValue, value.resetOutputValue,
-          value.prevOutValue, frameSize, active_gate);
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
     } else {
       hl_naive_gru_forward_reset_output(
-          opResetOutput, value.gateValue, value.resetOutputValue,
-          value.prevOutValue, frameSize, active_gate);
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
     }
 
-    value.gateValue += frameSize * 3;
-    value.resetOutputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    value.reset_output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
   }
 }
 
 template <class OpFinalOutput, typename T>
-inline void forward_final_output(OpFinalOutput opFinalOutput,
-                                 hl_gru_value<T> value, int frameSize,
-                                 int batchSize, activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
-      hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
-                                      value.prevOutValue, value.outputValue,
-                                      frameSize, active_node);
+inline void forward_final_output(OpFinalOutput op_final_output,
+                                 hl_gru_value<T> value, int frame_size,
+                                 int batch_size,
+                                 activation_mode_t active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
+                                      value.prev_out_value, value.output_value,
+                                      frame_size, active_node);
     } else {
-      hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
-                                        value.prevOutValue, value.outputValue,
-                                        frameSize, active_node);
+      hl_naive_gru_forward_final_output(
+          op_final_output, value.gate_value, value.prev_out_value,
+          value.output_value, frame_size, active_node);
     }
 
-    value.gateValue += frameSize * 3;
-    value.outputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    value.output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
   }
 }
 
 template <class OpStateGrad, typename T>
-void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
-                                      T *gateGrad, T *prevOutValue,
-                                      T *prevOutGrad, T *outputGrad,
-                                      int frameSize,
+void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *output_grad,
+                                      int frame_size,
                                       activation_mode_t active_node) {
-  T rUpdateGateValue;
-  T rUpdateGateGrad;
-  T rFrameStateValue;
-  T rFrameStateGrad;
-  T rOutGrad;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T *updateGateValue = gateValue;
-  T *updateGateGrad = gateGrad;
-  T *frameStateValue = gateValue + frameSize * 2;
-  T *frameStateGrad = gateGrad + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad = outputGrad[i];
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_frame_state_value;
+  T r_frame_state_grad;
+  T r_out_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *frame_state_value = gate_value + frame_size * 2;
+  T *frame_state_grad = gate_grad + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = output_grad[i];
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = prevOutGrad[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
     }
 
-    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
-                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
-                active_node);
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
     }
   }
 }
 
 template <class OpResetGrad, typename T>
-void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
-                                      T *gateGrad, T *prevOutValue,
-                                      T *prevOutGrad, T *resetOutputGrad,
-                                      int frameSize,
+void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *reset_output_grad,
+                                      int frame_size,
                                       activation_mode_t active_gate) {
-  T rUpdateGateValue;
-  T rUpdateGateGrad;
-  T rResetGateValue;
-  T rResetGateGrad;
-  T rResetOutputGrad = 0;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T *updateGateValue = gateValue;
-  T *updateGateGrad = gateGrad;
-  T *resetGateValue = gateValue + frameSize;
-  T *resetGateGrad = gateGrad + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = resetOutputGrad[i];
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_reset_gate_value;
+  T r_reset_gate_grad;
+  T r_reset_output_grad = 0;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *reset_gate_value = gate_value + frame_size;
+  T *reset_gate_grad = gate_grad + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = reset_output_grad[i];
     }
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = prevOutGrad[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
     }
 
-    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
-                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
-                active_gate);
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
     }
   }
 }
 
 template <class OpStateGrad, typename T>
-void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
-                                    T *gateGrad, T *prevOutValue,
-                                    T *prevOutGrad, T *outputGrad,
-                                    int frameSize,
+void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *output_grad,
+                                    int frame_size,
                                     activation_mode_t active_node) {
 #ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rFrameStateValue;
-  __m256 rFrameStateGrad;
-  __m256 rOutGrad;
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256 *)gateValue;
-  __m256 *updateGateGrad = (__m256 *)gateGrad;
-  __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
-  __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad = ((__m256 *)outputGrad)[i];
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_frame_state_value;
+  __m256 r_frame_state_grad;
+  __m256 r_out_grad;
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = ((__m256 *)output_grad)[i];
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
     }
 
-    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
-                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
-                active_node);
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
     }
   }
 #endif
 }
 
 template <class OpResetGrad, typename T>
-void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
-                                    T *gateGrad, T *prevOutValue,
-                                    T *prevOutGrad, T *resetOutputGrad,
-                                    int frameSize,
+void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *reset_output_grad,
+                                    int frame_size,
                                     activation_mode_t active_gate) {
 #ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rResetGateValue;
-  __m256 rResetGateGrad;
-  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256 *)gateValue;
-  __m256 *updateGateGrad = (__m256 *)gateGrad;
-  __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
-  __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_reset_gate_value;
+  __m256 r_reset_gate_grad;
+  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
+  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
     }
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
     }
 
-    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
-                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
-                active_gate);
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
     }
   }
 #endif
 }
 
 template <class OpStateGrad, typename T>
-inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
-                                hl_gru_grad<T> grad, int frameSize,
-                                int batchSize, activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void backward_state_grad(OpStateGrad op_state_grad,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                int frame_size, int batch_size,
+                                activation_mode_t active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_backward_state_grad(
-          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
     } else {
       hl_naive_gru_backward_state_grad(
-          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
     }
 
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
 
-    grad.gateGrad += frameSize * 3;
-    grad.outputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
+    grad.gate_grad += frame_size * 3;
+    grad.output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
     }
   }
 }
 
 template <class OpResetGrad, typename T>
-inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
-                                hl_gru_grad<T> grad, int frameSize,
-                                int batchSize, activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void backward_reset_grad(OpResetGrad op_reset_grad,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                int frame_size, int batch_size,
+                                activation_mode_t active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_backward_reset_grad(
-          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
     } else {
       hl_naive_gru_backward_reset_grad(
-          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
     }
 
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
 
-    grad.gateGrad += frameSize * 3;
-    grad.resetOutputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
+    grad.gate_grad += frame_size * 3;
+    grad.reset_output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
     }
   }
 }
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
index 6441c648b048422c110872a85aa8cb719f11a8d7..d2edcb7f258b387530799b967fc0fff61acc5b83 100644
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -27,174 +27,174 @@ namespace math {
 namespace detail {
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpResetOutput, bool isBatch, typename T>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
-                                        T *gateValue, T *resetOutputValue,
-                                        T *prevOutputValue, int frameSize,
-                                        int batchSize,
+template <class OpResetOutput, bool is_batch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
+                                        T *gate_value, T *reset_output_value,
+                                        T *prev_output_value, int frame_size,
+                                        int batch_size,
                                         activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    resetOutputValue += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    reset_output_value += batch_idx * frame_size;
   }
 
-  T rPrevOut = 0;
-  T rValueResetOutput;
-  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+  T r_prev_out = 0;
+  T r_value_reset_output;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
 
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
   }
 
-  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
-                active_gate);
+  op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                  r_value_reset_output, active_gate);
 
-  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
-  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
-  resetOutputValue[frameIdx] = rValueResetOutput;
+  gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
+  gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
+  reset_output_value[frame_idx] = r_value_reset_output;
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpFinalOutput, bool isBatch, typename T>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
-                                        T *gateValue, T *prevOutputValue,
-                                        T *outputValue, int frameSize,
-                                        int batchSize,
+template <class OpFinalOutput, bool is_batch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
+                                        T *gate_value, T *prev_output_value,
+                                        T *output_value, int frame_size,
+                                        int batch_size,
                                         activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    outputValue += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    output_value += batch_idx * frame_size;
   }
 
-  T rOutput;
-  T rPrevOut = 0;
-  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+  T r_output;
+  T r_prev_out = 0;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
 
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
   }
 
-  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
-                active_node);
+  op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                  r_output, active_node);
 
-  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
-  outputValue[frameIdx] = rOutput;
+  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
+  output_value[frame_idx] = r_output;
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpStateGrad, bool isBatch, typename T>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
-                                       T *gateGrad, T *prevOutValue,
-                                       T *prevOutGrad, T *outputGrad,
-                                       int frameSize, int batchSize,
+template <class OpStateGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *output_grad,
+                                       int frame_size, int batch_size,
                                        activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad += batchIdx * 3 * frameSize;
-    outputGrad += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    output_grad += batch_idx * frame_size;
   }
 
-  T rUpdateGateGrad;
-  T rFrameStateGrad;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
-  T rOutGrad = outputGrad[frameIdx];
+  T r_update_gate_grad;
+  T r_frame_state_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
+  T r_out_grad = output_grad[frame_idx];
 
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
 
-    if (isBatch) prevOutGrad += batchIdx * frameSize;
-    rPrevOutGrad = prevOutGrad[frameIdx];
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_grad = prev_out_grad[frame_idx];
   }
 
-  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
-              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
-              active_node);
+  op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                r_out_grad, active_node);
 
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
   }
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpResetGrad, bool isBatch, typename T>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
-                                       T *gateGrad, T *prevOutValue,
-                                       T *prevOutGrad, T *resetOutputGrad,
-                                       int frameSize, int batchSize,
+template <class OpResetGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *reset_output_grad,
+                                       int frame_size, int batch_size,
                                        activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad += batchIdx * 3 * frameSize;
-    resetOutputGrad += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    reset_output_grad += batch_idx * frame_size;
   }
 
-  T rResetGateGrad;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T rResetOutputGrad = 0;
-  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
-  T rResetGateValue = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    if (isBatch) prevOutGrad += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-    rPrevOutGrad = prevOutGrad[frameIdx];
-    rResetOutputGrad = resetOutputGrad[frameIdx];
+  T r_reset_gate_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_reset_output_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
+  T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
+
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
+    r_prev_out_grad = prev_out_grad[frame_idx];
+    r_reset_output_grad = reset_output_grad[frame_idx];
   }
 
-  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
-              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
-              active_gate);
+  op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                r_reset_output_grad, active_gate);
 
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
   }
 }
 }  // namespace detail
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
index 8a681d8d8bced72e1296f863489f6ccbc7913167..acd84be01db9ddaf06d165d8be353b253f324dd2 100644
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -28,23 +28,25 @@ namespace forward {
 template <typename T>
 class gru_resetOutput {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
-                             T &valueResetOutput, activation_mode_t actGate) {
-    valueUpdateGate = activation(valueUpdateGate, actGate);
-    valueResetGate = activation(valueResetGate, actGate);
-    valueResetOutput = prevOut * valueResetGate;
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
+                             T &prev_out, T &value_reset_output,
+                             activation_mode_t act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = prev_out * value_reset_gate;
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
-                             __m256 &prevOut, __m256 &valueResetOutput,
-                             activation_mode_t actGate) {
-    valueUpdateGate = activation(valueUpdateGate, actGate);
-    valueResetGate = activation(valueResetGate, actGate);
-    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_reset_gate, __m256 &prev_out,
+                             __m256 &value_reset_output,
+                             activation_mode_t act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
   }
 #endif
 #endif
@@ -53,24 +55,26 @@ class gru_resetOutput {
 template <typename T>
 class gru_finalOutput {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
-                             T &valueOutput, activation_mode_t actInput) {
-    valueFrameState = activation(valueFrameState, actInput);
-    valueOutput = prevOut - (valueUpdateGate * prevOut) +
-                  (valueUpdateGate * valueFrameState);
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
+                             T &prev_out, T &value_output,
+                             activation_mode_t act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = prev_out - (value_update_gate * prev_out) +
+                   (value_update_gate * value_frame_state);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
-                             __m256 &prevOut, __m256 &valueOutput,
-                             activation_mode_t actInput) {
-    valueFrameState = activation(valueFrameState, actInput);
-    valueOutput = _mm256_add_ps(
-        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
-        _mm256_mul_ps(valueUpdateGate, valueFrameState));
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_frame_state, __m256 &prev_out,
+                             __m256 &value_output,
+                             activation_mode_t act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = _mm256_add_ps(
+        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
+        _mm256_mul_ps(value_update_gate, value_frame_state));
   }
 #endif
 #endif
@@ -82,34 +86,37 @@ namespace backward {
 template <typename T>
 class gru_stateGrad {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
-                             T &valueFrameState, T &gradFrameState,
-                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
-                             activation_mode_t actInput) {
-    gradUpdateGate = (gradOutput * valueFrameState);
-    gradUpdateGate -= (gradOutput * valuePrevOut);
-    gradPrevOut -= (gradOutput * valueUpdateGate);
-    gradPrevOut += gradOutput;
-    gradFrameState =
-        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_frame_state, T &grad_frame_state,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_output, activation_mode_t act_input) {
+    grad_update_gate = (grad_output * value_frame_state);
+    grad_update_gate -= (grad_output * value_prev_out);
+    grad_prev_out -= (grad_output * value_update_gate);
+    grad_prev_out += grad_output;
+    grad_frame_state = activation(grad_output * value_update_gate,
+                                  value_frame_state, act_input);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
-                             __m256 &valueFrameState, __m256 &gradFrameState,
-                             __m256 &valuePrevOut, __m256 &gradPrevOut,
-                             __m256 &gradOutput, activation_mode_t actInput) {
-    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
-    gradUpdateGate =
-        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
-    gradPrevOut = _mm256_add_ps(
-        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
-        gradOutput);
-    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
-                                valueFrameState, actInput);
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate,
+                             __m256 &value_frame_state,
+                             __m256 &grad_frame_state, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_output,
+                             activation_mode_t act_input) {
+    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
+    grad_update_gate = _mm256_sub_ps(
+        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
+    grad_prev_out = _mm256_add_ps(
+        _mm256_sub_ps(grad_prev_out,
+                      _mm256_mul_ps(grad_output, value_update_gate)),
+        grad_output);
+    grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
+                                  value_frame_state, act_input);
   }
 #endif
 #endif
@@ -118,30 +125,32 @@ class gru_stateGrad {
 template <typename T>
 class gru_resetGrad {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
-                             T &valueResetGate, T &gradResetGate,
-                             T &valuePrevOut, T &gradPrevOut,
-                             T &gradResetOutput, activation_mode_t actGate) {
-    gradResetGate = (gradResetOutput * valuePrevOut);
-    gradPrevOut += (gradResetOutput * valueResetGate);
-    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
-    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_reset_gate, T &grad_reset_gate,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_reset_output, activation_mode_t act_gate) {
+    grad_reset_gate = (grad_reset_output * value_prev_out);
+    grad_prev_out += (grad_reset_output * value_reset_gate);
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
-                             __m256 &valueResetGate, __m256 &gradResetGate,
-                             __m256 &valuePrevOut, __m256 &gradPrevOut,
-                             __m256 &gradResetOutput,
-                             activation_mode_t actGate) {
-    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
-    gradPrevOut = _mm256_add_ps(gradPrevOut,
-                                _mm256_mul_ps(gradResetOutput, valueResetGate));
-    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
-    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate, __m256 &value_reset_gate,
+                             __m256 &grad_reset_gate, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_reset_output,
+                             activation_mode_t act_gate) {
+    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
+    grad_prev_out = _mm256_add_ps(
+        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
   }
 #endif
 #endif
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
index 125af449d3f700e24be5e4b7615c3b0e03fd4e5b..ae4e47b014a9cd1f656dd9332086aa4d1b7cbb52 100644
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -21,29 +21,29 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
 #ifndef __NVCC__
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, false, batchSize, frameSize * 2, frameSize, 1,
-          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
-          value.gateValue, frameSize * 3);
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
     }
 
     detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
-                                 frameSize, batchSize, active_gate);
+                                 frame_size, batch_size, active_gate);
 
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, false, batchSize, frameSize, frameSize, 1,
-          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
-          value.gateValue + frameSize * 2, frameSize * 3);
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
     }
 
     detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                 frameSize, batchSize, active_node);
+                                 frame_size, batch_size, active_node);
 #endif
   }
 };
@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                       activation_mode_t active_gate) {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
-                                grad, frameSize, batchSize, active_node);
+                                grad, frame_size, batch_size, active_node);
 
-    if (value.prevOutValue && grad.prevOutGrad) {
+    if (value.prev_out_value && grad.prev_out_grad) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize, 1,
-          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
-          frameSize, 0, grad.resetOutputGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
 
-      if (grad.stateWeightGrad) {
+      if (grad.state_weight_grad) {
         math::gemm<platform::CPUPlace, T>(
-            context, true, false, frameSize, frameSize, batchSize, 1,
-            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
-            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
       }
     }
 
     detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
-                                grad, frameSize, batchSize, active_gate);
+                                grad, frame_size, batch_size, active_gate);
 
-    if (grad.prevOutGrad && value.prevOutValue) {
+    if (grad.prev_out_grad && value.prev_out_value) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
-          grad.prevOutGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
 
-      if (grad.gateWeightGrad) {
+      if (grad.gate_weight_grad) {
         math::gemm<platform::CPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
       }
     }
 #endif
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
index 7b9e54ac029f6aa00553338435684097d6d02b25..0252bdbdb63fef2e4754057fc5b6d415cef0c29f 100644
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
@@ -21,66 +21,66 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::GPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
     dim3 threads;
     dim3 grid;
-    if (batchSize == 1) {
-      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-      int frameBlocks = (frameSize + 1024 - 1) / 1024;
-      threads = dim3(framePerBlock, 1);
-      grid = dim3(frameBlocks, 1);
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
     } else {
       threads = dim3(32, 32);
-      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
     }
 
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, false, batchSize, frameSize * 2, frameSize, 1,
-          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
-          value.gateValue, frameSize * 3);
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* isBatch= */ false,
+                                      /* is_batch= */ false,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gateValue,
-          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
-          active_gate);
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
     } else {
       detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* isBatch= */ true,
+                                      /* is_batch= */ true,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gateValue,
-          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
-          active_gate);
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
     }
 
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, false, batchSize, frameSize, frameSize, 1,
-          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
-          value.gateValue + frameSize * 2, frameSize * 3);
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* isBatch= */ false,
+                                      /* is_batch= */ false,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gateValue,
-          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
           active_node);
     } else {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* isBatch= */ true,
+                                      /* is_batch= */ true,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gateValue,
-          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
           active_node);
     }
   }
@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::GPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                       activation_mode_t active_gate) {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
     dim3 threads;
     dim3 grid;
-    if (batchSize == 1) {
-      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-      int frameBlocks = (frameSize + 1024 - 1) / 1024;
-      threads = dim3(framePerBlock, 1);
-      grid = dim3(frameBlocks, 1);
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
     } else {
       threads = dim3(32, 32);
-      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruBackwardStateGrad<
           detail::backward::gru_stateGrad<T>,
-          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
-          batchSize, active_node);
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
     } else {
       detail::KeGruBackwardStateGrad<
           detail::backward::gru_stateGrad<T>,
-          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
-          batchSize, active_node);
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
     }
 
-    if (value.prevOutValue && grad.prevOutGrad) {
+    if (value.prev_out_value && grad.prev_out_grad) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize, 1,
-          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
-          frameSize, 0, grad.resetOutputGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
 
-      if (grad.stateWeightGrad) {
+      if (grad.state_weight_grad) {
         math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize, batchSize, 1,
-            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
-            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
       }
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruBackwardResetGrad<
           detail::backward::gru_resetGrad<T>,
-          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
-          batchSize, active_gate);
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
     } else {
       detail::KeGruBackwardResetGrad<
           detail::backward::gru_resetGrad<T>,
-          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
-          batchSize, active_gate);
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
     }
 
-    if (grad.prevOutGrad && value.prevOutValue) {
+    if (grad.prev_out_grad && value.prev_out_value) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
-          grad.prevOutGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
 
-      if (grad.gateWeightGrad) {
+      if (grad.gate_weight_grad) {
         math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
       }
     }
   }
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
index 1475fb38104f353857dfd968e46af98a6d52c52a..58ea59f68e91c647a6b29ce3e8bc7e5d25db9b9b 100644
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -22,28 +22,28 @@ namespace math {
 // TODO(guosheng): refine code style in gru_compute
 template <typename T>
 struct hl_gru_value {
-  T *gateWeight;
-  T *stateWeight;
-  T *gateValue;
-  T *resetOutputValue;
-  T *outputValue;
-  T *prevOutValue;
+  T *gate_weight;
+  T *state_weight;
+  T *gate_value;
+  T *reset_output_value;
+  T *output_value;
+  T *prev_out_value;
 };
 
 template <typename T>
 struct hl_gru_grad {
-  T *gateWeightGrad;
-  T *stateWeightGrad;
-  T *gateGrad;
-  T *resetOutputGrad;
-  T *outputGrad;
-  T *prevOutGrad;
+  T *gate_weight_grad;
+  T *state_weight_grad;
+  T *gate_grad;
+  T *reset_output_grad;
+  T *output_grad;
+  T *prev_out_grad;
 };
 
 template <typename Place, typename T>
 struct GRUUnitFunctor {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate);
 };
@@ -51,8 +51,9 @@ struct GRUUnitFunctor {
 template <typename Place, typename T>
 struct GRUUnitGradFunctor {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                       activation_mode_t active_gate);
 };
 
diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
index e5168ce7afd4139475afa6edd5999b9974407c9b..c9003962d33b70b8e21a0d6b78bf5a77981df409 100644
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
@@ -23,8 +23,7 @@ template <typename T>
 class MaxOutFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * output,
+                  const framework::Tensor& input, framework::Tensor* output,
                   int groups) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
@@ -37,34 +36,30 @@ class MaxOutFunctor<platform::CPUPlace, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; ++i) {
-      int new_bindex =  c_size * i;
+      int new_bindex = c_size * i;
       for (int c = 0; c < output_channels; ++c) {
         int new_cindex = fea_size * c;
         for (int f = 0; f < fea_size; ++f) {
           T ele = static_cast<T>(-FLT_MAX);
           for (int ph = 0; ph < groups; ++ph) {
-            T x = input_data[(new_bindex + new_cindex) * groups
-              + ph * fea_size + f];
+            T x = input_data[(new_bindex + new_cindex) * groups +
+                             ph * fea_size + f];
             ele = ele > x ? ele : x;
           }
-          output_data[(new_bindex+new_cindex+f)] = ele;
+          output_data[(new_bindex + new_cindex + f)] = ele;
         }
       }
     }
   }
 };
 
-
-
 template <class T>
 class MaxOutGradFunctor<platform::CPUPlace, T> {
-public:
+ public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  int groups) {
+                  const framework::Tensor& output_grad, int groups) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -84,11 +79,11 @@ public:
           bool continue_match = true;
           int output_idx = blen + clen + f;
           for (int g = 0; g < groups && continue_match; ++g) {
-              int input_idx = input_idx0 + fea_size * g;
-              if (input_data[input_idx] == output_data[output_idx]) {
-                input_grad_data[input_idx] += output_grad_data[output_idx];
-                continue_match = false;
-              }
+            int input_idx = input_idx0 + fea_size * g;
+            if (input_data[input_idx] == output_data[output_idx]) {
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+              continue_match = false;
+            }
           }
         }
       }
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
index 7c698577b8a8258a58ba9a2b6c675457b2458a5b..c3fabcae081e24d92d50d0e2a2cad4a2e9872125 100644
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
@@ -21,9 +21,9 @@ namespace math {
 
 template <typename T>
 __global__ void KernelMaxOut(const int nthreads, const T* input_data,
-                            const int channels,
-                             const int input_height, const int input_width,
-                             int groups, T* output_data ) {
+                             const int channels, const int input_height,
+                             const int input_width, int groups,
+                             T* output_data) {
   const int size = input_height * input_width * channels / groups;
   const int feat_len = input_height * input_width;
   int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -34,7 +34,7 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
     int channel_idx = batch_offset / feat_len;
     int feat_idx = batch_offset % feat_len;
     int data_idx =
-      (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
     T ele = static_cast<T>(-FLT_MAX);
     for (int g = 0; g < groups; ++g) {
       T x = input_data[data_idx + g * feat_len];
@@ -44,34 +44,35 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
   }
 }
 template <typename T>
-__global__ void KernelMaxoutGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, int groups) {
-    const int size = input_height * input_width * channels / groups;
-    const int feat_len = input_height * input_width;
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int offset = blockDim.x * gridDim.x;
-    for (int i = index; i < nthreads; i += offset) {
-      int batch_idx = i / size;
-      int batch_offset = i % size;
-      int channel_idx = batch_offset / feat_len;
-      int feat_idx = batch_offset % feat_len;
-      int data_idx =
+__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
+                                 const T* output_data, const T* output_grad,
+                                 T* input_grad, const int channels,
+                                 const int input_height, const int input_width,
+                                 int groups) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
         (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
-      int max_index = -1;
-      bool continue_match = true;
-      for (int g = 0; g < groups && continue_match; ++g) {
-        if (input_data[data_idx + g * feat_len] == output_data[i]) {
-          max_index = data_idx + g * feat_len;
-          continue_match = false;
-          break;
-        }
-      }
-      if (max_index != -1) {
-        input_grad[max_index] += output_grad[index];
+    int max_index = -1;
+    bool continue_match = true;
+    for (int g = 0; g < groups && continue_match; ++g) {
+      if (input_data[data_idx + g * feat_len] == output_data[i]) {
+        max_index = data_idx + g * feat_len;
+        continue_match = false;
+        break;
       }
     }
+    if (max_index != -1) {
+      input_grad[max_index] += output_grad[i];  // i, not the thread's start
+    }
+  }
 }
 /*
  * All tensors are in NCHW format.
@@ -80,7 +81,7 @@ template <typename T>
 class MaxOutFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor * output,
+                  const framework::Tensor& input, framework::Tensor* output,
                   int groups) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -92,7 +93,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {
 
     const T* input_data = input.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());
-    int nthreads =  output->numel();
+    int nthreads = output->numel();
     int blocks = (nthreads + 1024 - 1) / 1024;
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
@@ -101,8 +102,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(nthreads, input_data, input_channels,
-                              input_height, input_width, groups,
-                              output_data);
+                              input_height, input_width, groups, output_data);
   }
 };
 /*
@@ -112,11 +112,9 @@ template <typename T>
 class MaxOutGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  int groups) {
+                  const framework::Tensor& output_grad, int groups) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -129,7 +127,7 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads =  output.numel();
+    int nthreads = output.numel();
     int blocks = (nthreads + 1024 - 1) / 1024;
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
@@ -137,9 +135,9 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     KernelMaxoutGrad<
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, groups);
+                 .stream()>>>(nthreads, input_data, output_data,
+                              output_grad_data, input_grad_data, input_channels,
+                              input_height, input_width, groups);
   }
 };
 
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
index d4c9da38ab8f8d88ed461d805ae64a015db968c4..2d9069b0b3ca3e7bad3b21a46985c52ef00f50e6 100644
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
@@ -21,15 +21,14 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX \
-    __FLT_MAX__
+#define FLT_MAX __FLT_MAX__
 
 template <typename Place, typename T>
 
 class MaxOutFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor * output,
+                  const framework::Tensor& input, framework::Tensor* output,
                   int groups);
 };
 
@@ -37,8 +36,7 @@ template <typename Place, class T>
 class MaxOutGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, int groups);
 };
diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b57d3dc1414cff492db8d7d503a7fce370a3f151
--- /dev/null
+++ b/paddle/operators/math/unpooling.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
+// Max-unpooling on CPU: scatters every input element to the slot of its
+// (batch, channel) output plane that the matching `indices` entry names.
+template <typename T>
+class Unpool2dMaxFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int output_channels = output->dims()[1];
+    const int input_feasize = input.dims()[2] * input.dims()[3];
+    const int output_feasize = output->dims()[2] * output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          // Reject negative indices too; they would write before the plane.
+          PADDLE_ENFORCE(index >= 0 && index < output_feasize,
+                         "err index in unpooling!");
+          output_data[index] = input_data[i];
+        }
+        input_data += input_feasize;
+        indices_data += input_feasize;
+        output_data += output_feasize;
+      }
+    }
+  }
+};
+// Gradient of CPU max-unpooling: each input-gradient element is read back
+// from the output-gradient slot that the matching `indices` entry names.
+template <class T>
+class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int output_channels = output.dims()[1];
+    const int input_feasize = input.dims()[2] * input.dims()[3];
+    const int output_feasize = output.dims()[2] * output.dims()[3];
+    const int* indices_data = indices.data<int>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          // Reject negative indices too; they would read before the plane.
+          PADDLE_ENFORCE(index >= 0 && index < output_feasize,
+                         "err index in unpooling!");
+          input_grad_data[i] = output_grad_data[index];
+        }
+        input_grad_data += input_feasize;
+        indices_data += input_feasize;
+        output_grad_data += output_feasize;
+      }
+    }
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..37c3c8b689f9a69b68ddffd23813fa9ad8ced0e7
--- /dev/null
+++ b/paddle/operators/math/unpooling.cu
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// Scatters input_data[i] into its (batch, channel) plane of output_data at the
+// position indices_data[i]; output slots no index refers to are not written.
+template <typename T>
+__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
+                                  const int* indices_data,
+                                  const int input_height, const int input_width,
+                                  const int channels, T* output_data,
+                                  const int output_height,
+                                  const int output_width) {
+  const int in_c_stride = input_height * input_width;
+  const int in_n_stride = in_c_stride * channels;
+  const int out_c_stride = output_height * output_width;
+  const int out_n_stride = out_c_stride * channels;
+  const int step = blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nthreads; i += step) {
+    const int cidx = (i % in_n_stride) / in_c_stride;
+    const int plane = (i / in_n_stride) * out_n_stride + cidx * out_c_stride;
+    const int out_index = indices_data[i];
+    // Reject negative indices too: they would write before the plane start.
+    PADDLE_ASSERT(out_index >= 0 && out_index < out_c_stride);
+    output_data[plane + out_index] = input_data[i];
+  }
+}
+// Gathers input_grad[i] from the (batch, channel) plane of output_grad at the
+// position indices_data[i]. input_data and output_data are not read here.
+template <typename T>
+__global__ void KernelUnpool2dMaxGrad(
+    const int nthreads, const T* input_data, const int* indices_data,
+    const int input_height, const int input_width, const int channels,
+    const T* output_data, const T* output_grad, const int output_height,
+    const int output_width, T* input_grad) {
+  const int in_c_stride = input_height * input_width;
+  const int in_n_stride = in_c_stride * channels;
+  const int out_c_stride = output_height * output_width;
+  const int out_n_stride = out_c_stride * channels;
+  const int step = blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nthreads; i += step) {
+    const int cidx = (i % in_n_stride) / in_c_stride;
+    const int plane = (i / in_n_stride) * out_n_stride + cidx * out_c_stride;
+    const int out_index = indices_data[i];
+    // Reject negative indices too: they would read before the plane start.
+    PADDLE_ASSERT(out_index >= 0 && out_index < out_c_stride);
+    input_grad[i] = output_grad[plane + out_index];
+  }
+}
+/*
+ * GPU max-unpooling forward: launches KernelUnpool2dMax on the context's
+ * stream with enough blocks to cover input.numel(). Tensors are NCHW.
+ */
+template <typename T>
+class Unpool2dMaxFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMax<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_height, output_width);
+  }
+};
+/*
+ * GPU max-unpooling backward: launches KernelUnpool2dMaxGrad on the context's
+ * stream with enough blocks to cover input.numel(). Tensors are NCHW.
+ */
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMaxGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_grad_data, output_height,
+                              output_width, input_grad_data);
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..7077d7c2274fd9e02b69ef343f310f4ffbbcff1a
--- /dev/null
+++ b/paddle/operators/math/unpooling.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename Place, typename T>
+class Unpool2dMaxFunctor {  // forward max-unpooling; specialized per Place
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,  // values to scatter
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename Place, class T>
+class Unpool2dMaxGradFunctor {  // backward max-unpooling; specialized per Place
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,  // int position per element
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,  // grad of output
+                  framework::Tensor* input_grad);  // receives the result
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
index 95467f2e69093906980d075b6a41c5d2934dd5a2..e203a25d544372220e8246e5e17ffbc6408d2998 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -22,16 +22,17 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
+    AddInput(
+        "X",
         "(Tensor) The input tensor of maxout operator. "
         "The format of input tensor is NCHW. Where N is batch size, C is the "
         "number of channels, H and W is the height and width of feature.");
     AddOutput("Out",
-        "(Tensor) The output tensor of maxout operator."
-        "The format of output tensor is also NCHW."
-        "Where N is batch size, C is "
-        "the number of channels, H and W is the height and "
-        "width of feature.");
+              "(Tensor) The output tensor of maxout operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
     AddAttr<int>(
         "groups",
         R"DOC("Specifies how many groups the input tensor will be split"
@@ -59,21 +60,19 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-
 class MaxOutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MaxoutOp"
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp"
                    "should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of MaxoutOp should not be null.");
     auto in_x_dims = ctx->GetInputDim("X");
     int groups = ctx->Attrs().Get<int>("groups");
     // check groups > 1
-    PADDLE_ENFORCE_GT(
-        groups, 1,
-        "groups should be larger than 1 in maxoutop");
+    PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
     output_shape.push_back(in_x_dims[2]);
     output_shape.push_back(in_x_dims[3]);
@@ -87,18 +86,17 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-    "Input(X@GRAD) should not be null.");
+                   "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
-}    // namespace operators
-}    // namespace paddle
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-                        ops::MaxOutOpGrad);
-REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace,
-                       float>);
-REGISTER_OP_CPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::CPUPlace,
-                       float>);
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
index a5823fba6848a0d42a743c90d7d683e3e4ae4422..decd43913d69d122330886e07178778d03f7fef5 100644
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
@@ -18,8 +18,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(maxout,
                        ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
                        ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace,
-                        float>,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace,
-                        double>);
+REGISTER_OP_GPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
+    ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
index c404cd16a9b2372ea4c6a17eb5ac82cf8f3bf27c..44a0d073dda642f6e261ce5760013f3e1055f43d 100644
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
@@ -53,7 +53,7 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
       zero(device_ctx, in_x_grad, static_cast<T>(0.0));
       math::MaxOutGradFunctor<Place, T> maxout_backward;
       maxout_backward(context.device_context(), *in_x, in_x_grad, *out,
-        *out_grad, groups);
+                      *out_grad, groups);
     }
   }
 };
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index d8c58618cf703d086d3cabc927ebc5eb038b1aec..e26ffd86e5b5645e361070ca9fd9d8dc49d1ed30 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
   // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
+      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
       "operator."
       "If global_pooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -122,15 +122,15 @@ Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
-Example:
+Example:   
   Input:
        X shape: $(N, C, H_{in}, W_{in})$
   Output:
        Out shape: $(N, C, H_{out}, W_{out})$
-  where 
+  Where
        $$ 
-       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
 
 )DOC");
@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                                // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
+      "(vector<int>, default {0,0,0}), paddings(depth, height, "
       "width) of pooling operator. "
       "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -199,12 +199,12 @@ Example:
        X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
        Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  where
-       $$
-       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
-       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
-       $$
+  Where
+  $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+  $$
 
 )DOC");
 }
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index 4958fa645405db0798f37165030eae95da371477..b9c42a69128a26ff5942748e11fb87c57d3e3f58 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector<int>, defalut:{0, 0}), paddings(height, width) of pooling "
+        "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
         "operator. "
         "If global_pooling = true, paddings and will be ignored.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -166,10 +166,10 @@ Example:
   Output:
        Out shape: $(N, C, H_{out}, W_{out})$
        Mask shape: $(N, C, H_{out}, W_{out})$
-  where
+  Where
        $$
-       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
 
 )DOC");
@@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector, defalut {0,0,0}), paddings(depth, "
+        "(vector, default {0,0,0}), paddings(depth, "
         "height, width) of pooling operator. "
         "If global_pooling = true, paddings and ksize will be ignored.")
         .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -244,11 +244,11 @@ Example:
   Output:
        Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
        Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  where
+  Where
        $$
-       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
-       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
        $$
 
 )DOC");
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 061e82412ea5f4f17fd26a7094e68b97138cc09c..912f88f455252effbdb12ecfc45e4afefa60e03e 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel {
     auto right_dims = ctx->GetInputDim("Right");
 
     PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
-                   "All inputs must have the same size");
-    PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
-                   "All inputs must be row vector with size batch_size x 1.");
+                   "All inputs must have the same size.");
+    PADDLE_ENFORCE(
+        (label_dims.size() == 2) && (label_dims[1] == 1),
+        "All inputs must be 2-D tensors with shape [batch_size x 1].");
     ctx->SetOutputDim("Out", label_dims);
   }
 };
@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
                   framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Label",
-             "The label indicating A ranked higher than B or not, row vector.");
-    AddInput("Left", "The output of RankNet for doc A, vector.");
-    AddInput("Right", "The output of RankNet for doc B, vetor.");
-    AddOutput("Out", "The output loss of RankLoss operator, vector.");
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The label indicating A ranked higher than B or not.");
+    AddInput("Left",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc A.");
+    AddInput("Right",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc B.");
+    AddOutput("Out",
+              "(2-D Tensor with shape [batch_size x 1]) "
+              "The output loss of RankLoss operator.");
     AddComment(R"DOC(
 RankLoss Operator.
 
@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
 the input pair.
 
 The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
-(P_{i,j}), which represent the output of RankNet for the two docs and the label, 
-respectively, and yields the rank loss C_{i,j} using the following equation:
+(P_{i,j}), which represent the output score of RankNet for the two docs and 
+the label respectively, and yields the rank loss C_{i,j} using the following 
+equation:
 
-\f$$
-  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
+$$
+  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
   o_{i,j} =  o_i - o_j  \\
   \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-\f$$
+$$
 
-The operator can take inputs of one sample or in batch.
+The operator can take batch inputs with size batch_size (batch_size >= 1).
 
 )DOC");
   }
diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu
index 779588ff36c792b8925a535d60f1cfbbe3c66d86..5382e3a6296acd257211104d8ec6835c11b90bdd 100644
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index f184d6efcb496a1d7f38540712b6c431f816482e..703c77a0b21f2b2f0b0ae6fae86aae819ea824b5 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c69e416e10f2a9ced1f1b22c39235e4c9338e77c
--- /dev/null
+++ b/paddle/operators/recv_op.cc
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <ostream>
+#include <thread>
+
+#include <unistd.h>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+// Builds a gRPC server bound to server_address, registers `service` on it,
+// publishes the raw server pointer through *rpc_server and then blocks in
+// Server::Wait(). Used as the body of RecvOp's server thread.
+// NOTE(review): *rpc_server is written without synchronization while another
+// thread may read it (see ~RecvOp) -- confirm callers cannot race on it.
+void RunServer(Server **rpc_server,
+               std::shared_ptr<detail::SendRecvServerImpl> service,
+               const std::string &server_address) {
+  ServerBuilder builder;
+  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<Server> server(builder.BuildAndStart());
+  // BuildAndStart() yields a null pointer when the server cannot be started
+  // (e.g. the port is already bound); fail loudly instead of dereferencing it.
+  if (!server) {
+    LOG(FATAL) << "Failed to start gRPC server on " << server_address;
+  }
+  *rpc_server = server.get();
+  LOG(INFO) << "Server listening on " << server_address << std::endl;
+  server->Wait();
+}
+
+// Operator that hosts a gRPC receive service: every Run() takes one tensor
+// from the service, runs the "OptimizeBlock" sub-graph on it and pushes the
+// variable named "Out" back to the service.
+class RecvOp : public framework::OperatorBase {
+ public:
+  // Creates the service and starts the server thread on the "endpoint"
+  // attribute.
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    // rpc_service_ is always empty in a fresh object, so no guard is needed.
+    rpc_service_.reset(new detail::SendRecvServerImpl());
+    std::string endpoint = Attr<std::string>("endpoint");
+    server_thread_.reset(
+        new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
+  }
+
+  // Shuts the server down and waits for the server thread to exit.
+  // NOTE(review): rpc_server_ is assigned by the server thread; if this runs
+  // before that assignment it is still nullptr -- confirm this cannot happen.
+  virtual ~RecvOp() {
+    rpc_server_->Shutdown();
+    server_thread_->join();
+  }
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    // blocking get one var from client.
+    const framework::LoDTensor &t = rpc_service_->Get();
+    // NOTE(review): a child scope is created on every call and is not released
+    // in this function -- confirm the parent scope reclaims it.
+    framework::Scope &recv_scope = scope.NewScope();
+    // set graph input var
+    auto *var = recv_scope.Var(Input("RX"));
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    // FIXME(typhoonzero): do not copy
+    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
+
+    auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
+    auto *program = block->Program();
+    framework::Executor executor(dev_ctx);
+    // Run sub graph to get optimized tensor
+    executor.Run(*program, &recv_scope, block->ID(),
+                 false /*create_local_scope*/);
+
+    auto *out_var = recv_scope.FindVar("Out");
+    // The sub graph is expected to produce "Out"; fail with a clear message
+    // rather than dereferencing a null pointer when it does not.
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Variable Out is not found after running OptimizeBlock.");
+    // push back
+    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+  }
+
+ protected:
+  // grpc server instance to track status and gracefully shutdown.
+  // borrows a pointer from the server thread.
+  Server *rpc_server_{nullptr};
+  // grpc send/recv service implement to register.
+  std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;
+};
+
+// Declares the proto of the recv operator: the "RX" input and the "endpoint"
+// and "OptimizeBlock" attributes read by RecvOp.
+class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RX", "(Tensor) Input tensor to be saved");
+    AddComment(R"DOC(
+Recv operator
+
+This operator will recv tensor from send_op
+)DOC");
+    // NOTE(review): confirm AddCustomChecker actually acts on the lambda's
+    // bool result; if it expects a void checker this validation is a no-op.
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164) "
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    // Keep the description in ONE string argument: a second string literal
+    // would bind to AddAttr's trailing bool parameter instead of being text.
+    AddAttr<framework::BlockDescBind *>(
+        "OptimizeBlock", "type BlockDescBind*, optimize network run in server");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index ba774ec2160c0460867de42f7ad9d5cd65ad8d6a..39bf2118d603881531bf583ae468e8dc9b8bd181 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -1,11 +1,10 @@
-
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
     // TODO(qiao) change batch_size
     for (size_t i = 1; i < shape.size(); ++i) {
       PADDLE_ENFORCE(shape[i] > 0,
-                     "Each dimension of shape "
-                     "must be positiv except the first.");
+                     "Each dimension of Attr(shape) "
+                     "must be positive except the first one.");
     }
     if (shape[0] < 0) {
       shape[0] = x_dims[0];
diff --git a/paddle/operators/reshape_op.cu.cc b/paddle/operators/reshape_op.cu
similarity index 94%
rename from paddle/operators/reshape_op.cu.cc
rename to paddle/operators/reshape_op.cu
index 23dbe089d3b37aabedf9ef166f7bbfbf67da7e0a..dca6c15007a64808248443af32141b4a677f95d7 100644
--- a/paddle/operators/reshape_op.cu.cc
+++ b/paddle/operators/reshape_op.cu
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index 0e98c8b4f443f88ecba044f2f79228227695e182..73fd1da6428f55976a397b7f6f92bb0c796bfe02 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
old mode 100755
new mode 100644
index 156db9358689c90293311b8f08a7576b680c9472..2b5e66c96b726a3c1fdb2596a244c5395db85279
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -43,8 +43,8 @@ class ROIPoolOp : public framework::OperatorWithKernel {
                    "ROIs should be a 2-D tensor of shape (num_rois, 5)"
                    "given as [[batch_id, x1, y1, x2, y2], …].");
     PADDLE_ENFORCE(rois_dims[1] == kROISize,
-                "ROIs should be a 2-D tensor of shape (num_rois, 5)"
-                "given as [[batch_id, x1, y1, x2, y2], …].");
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
 
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
@@ -65,7 +65,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim("Out", out_dims);
     ctx->SetOutputDim("Argmax", out_dims);
-    }
+  }
 
  protected:
   framework::OpKernelType GetKernelType(
@@ -100,7 +100,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ROIPoolOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor), "
@@ -125,21 +125,22 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor), "
               "Argmaxes corresponding to indices in X used "
               "for gradient computation. Only output "
-              "if arg “is_test” is false.").AsIntermediate();
+              "if arg “is_test” is false.")
+        .AsIntermediate();
     AddAttr<float>("spatial_scale",
                    "(float, default 1.0), "
                    "Multiplicative spatial scale factor "
                    "to translate ROI coords from their input scale "
                    "to the scale used when pooling.")
-                   .SetDefault(1.0);
+        .SetDefault(1.0);
     AddAttr<int>("pooled_height",
                  "(int, default 1), "
                  "The pooled output height.")
-                 .SetDefault(1);
+        .SetDefault(1);
     AddAttr<int>("pooled_width",
                  "(int, default 1), "
                  "The pooled output width.")
-                 .SetDefault(1);
+        .SetDefault(1);
     AddComment(R"DOC(
 ROIPool operator
 
@@ -153,11 +154,10 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
-            roi_pool_grad, ops::ROIPoolGradOp);
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
+            ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
-    roi_pool,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
+    roi_pool, ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
     ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     roi_pool_grad,
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu
old mode 100755
new mode 100644
index 97df45f1b5779d5e28e36814450a9577edf85135..9a4c8ca752bb7abc4f44d4815743769bc989703a
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
@@ -29,101 +29,95 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-  template <typename T>
-  __global__ void GPUROIPoolForward(
-      const int nthreads, const T* input_data, const int64_t* input_rois,
-      const float spatial_scale, const int channels, const int height,
-      const int width, const int pooled_height, const int pooled_width,
-      T* output_data, int64_t* argmax_data) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int offset = blockDim.x * gridDim.x;
-    for (size_t i = index; i < nthreads; i += offset) {
-      int pw = index % pooled_width;
-      int ph = (index / pooled_width) % pooled_height;
-      int c = (index / pooled_width / pooled_height) % channels;
-      int n = index / pooled_width / pooled_height / channels;
-
-      const int64_t* offset_input_rois = input_rois + n * kROISize;
-      int roi_batch_ind = offset_input_rois[0];
-      int roi_start_w = round(offset_input_rois[1] * spatial_scale);
-      int roi_start_h = round(offset_input_rois[2] * spatial_scale);
-      int roi_end_w = round(offset_input_rois[3] * spatial_scale);
-      int roi_end_h = round(offset_input_rois[4] * spatial_scale);
-
-      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-      int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-      int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
-      int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
-      int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
-      int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
-
-      hstart = min(max(hstart + roi_start_h, 0), height);
-      hend = min(max(hend + roi_start_h, 0), height);
-      wstart = min(max(wstart + roi_start_w, 0), width);
-      wend = min(max(wend + roi_start_w, 0), width);
-      bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-      T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
-      int maxidx = -1;
-      const T* offset_input_data =
-          input_data + (roi_batch_ind * channels + c) * height * width;
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          int input_data_index = h * width + w;
-          if (offset_input_data[input_data_index] > maxval) {
-            maxval = offset_input_data[input_data_index];
-            maxidx = input_data_index;
-          }
+template <typename T>
+__global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
+                                  const int64_t* input_rois,
+                                  const float spatial_scale, const int channels,
+                                  const int height, const int width,
+                                  const int pooled_height,
+                                  const int pooled_width, T* output_data,
+                                  int64_t* argmax_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {  // grid-stride loop
+    int pw = i % pooled_width;  // decode flat output index i -> (n, c, ph, pw)
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+    hstart = min(max(hstart + roi_start_h, 0), height);
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+    int maxidx = -1;  // stays -1 when the bin is empty
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_data_index = h * width + w;
+        if (offset_input_data[input_data_index] > maxval) {
+          maxval = offset_input_data[input_data_index];
+          maxidx = input_data_index;
         }
       }
-      output_data[index] = maxval;
-      if (argmax_data) {
-        argmax_data[index] = maxidx;
-      }
+    }
+    output_data[i] = maxval;
+    if (argmax_data) {
+      argmax_data[i] = maxidx;
     }
   }
+}
 
 template <typename T>
 __global__ void GPUROIPoolBackward(
-    const int nthreads,
-    const int64_t* input_rois,
-    const T* output_grad,
-    const int64_t* argmax_data,
-    const int num_rois,
-    const float spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    T* input_grad) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int offset = blockDim.x * gridDim.x;
-    for (int i = index; i < nthreads; i += offset) {
-      int pw = index % pooled_width;
-      int ph = (index / pooled_width) % pooled_height;
-      int c = (index / pooled_width / pooled_height) % channels;
-      int n = index / pooled_width / pooled_height / channels;
-
-      const int64_t* offset_input_rois = input_rois + n * kROISize;
-      int roi_batch_ind = offset_input_rois[0];
-      int input_offset = (roi_batch_ind * channels + c) * height * width;
-      int output_offset = (n * channels + c) * pooled_height * pooled_width;
-      const T* offset_output_grad = output_grad + output_offset;
-      T* offset_input_grad = input_grad + input_offset;
-      const int64_t* offset_argmax_data = argmax_data + output_offset;
-
-      int argmax = offset_argmax_data[ph * pooled_width + pw];
-      if (argmax != -1) {
-        platform::CudaAtomicAdd(offset_input_grad + argmax,
+    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, T* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {  // grid-stride loop
+    int pw = i % pooled_width;  // decode flat output index i -> (n, c, ph, pw)
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int input_offset = (roi_batch_ind * channels + c) * height * width;
+    int output_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_output_grad = output_grad + output_offset;
+    T* offset_input_grad = input_grad + input_offset;
+    const int64_t* offset_argmax_data = argmax_data + output_offset;
+
+    int argmax = offset_argmax_data[ph * pooled_width + pw];
+    if (argmax != -1) {  // skip entries with no recorded argmax
+      platform::CudaAtomicAdd(
+          offset_input_grad + argmax,
           static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
-      }
     }
   }
-
+}
 
 template <typename Place, typename T>
 class GPUROIPoolOpKernel : public framework::OpKernel<T> {
@@ -145,25 +139,18 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     int width = in_dims[3];
 
     size_t rois_num = rois->dims()[0];
-    if (rois_num== 0) return;
+    if (rois_num == 0) return;
 
     int output_size = out->numel();
     int blocks = NumBlocks(output_size);
     int threads = kNumCUDAThreads;
 
-    GPUROIPoolForward<T>
-      <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-      output_size,
-      in->data<T>(),
-      rois->data<int64_t>(),
-      spatial_scale,
-      channels,
-      height,
-      width,
-      pooled_height,
-      pooled_width,
-      out->mutable_data<T>(ctx.GetPlace()),
-      argmax->mutable_data<int64_t>(ctx.GetPlace()));
+    GPUROIPoolForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
+        channels, height, width, pooled_height, pooled_width,
+        out->mutable_data<T>(ctx.GetPlace()),
+        argmax->mutable_data<int64_t>(ctx.GetPlace()));
   }
 };
 
@@ -175,10 +162,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto* rois = ctx.Input<Tensor>("ROIs");
     auto* argmax = ctx.Input<Tensor>("Argmax");
 
-    auto* out_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
     auto pooled_height = ctx.Attr<int>("pooled_height");
     auto pooled_width = ctx.Attr<int>("pooled_width");
@@ -199,21 +184,13 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
       int threads = kNumCUDAThreads;
 
       if (output_grad_size > 0) {
-        GPUROIPoolBackward<T>
-          <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-          output_grad_size,
-          rois->data<int64_t>(),
-          out_grad->data<T>(),
-          argmax->data<int64_t>(),
-          rois_num,
-          spatial_scale,
-          channels,
-          height,
-          width,
-          pooled_height,
-          pooled_width,
-          x_grad->mutable_data<T>(ctx.GetPlace()));
-        }
+        GPUROIPoolBackward<
+            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
+            width, pooled_height, pooled_width,
+            x_grad->mutable_data<T>(ctx.GetPlace()));
+      }
     }
   }
 };
@@ -223,8 +200,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    roi_pool,
-    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
+    roi_pool, ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
     ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     roi_pool_grad,
diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
old mode 100755
new mode 100644
index bd7736d63125f1be57c8af5141208f66d0592adb..3812c66c65457b9d1337690d1a82759aab9a9732
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
@@ -133,54 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* rois = ctx.Input<framework::Tensor>("ROIs");
     auto* argmax = ctx.Input<framework::Tensor>("Argmax");
-
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
 
     auto pooled_height = ctx.Attr<int>("pooled_height");
     auto pooled_width = ctx.Attr<int>("pooled_width");
 
-    if (x_grad) {
-      int channels = in->dims()[1];
-      auto in_stride = framework::stride(in->dims());
-      auto roi_stride = framework::stride(rois->dims());
-
+    if (in_grad) {
       const int64_t* rois_data = rois->data<int64_t>();
-      int rois_num = rois->dims()[0];
-
-      T* x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
+      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* argmax_data = argmax->data<int64_t>();
+      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+      set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
 
-      size_t roi_offset = roi_stride[0];
-      size_t batch_offset = in_stride[0];
-      size_t channel_offset = in_stride[1];
+      auto in_stride = framework::stride(in->dims());
+      auto argmax_stride = framework::stride(argmax->dims());
+      auto roi_stride = framework::stride(rois->dims());
+      auto out_stride = framework::stride(out_grad->dims());
 
-      const T* out_grad_data = out_grad->data<T>();
-      size_t pool_channel_offset = pooled_height * pooled_width;
-      const int64_t* argmax_data = argmax->data<int64_t>();
+      int rois_num = rois->dims()[0];
+      int channels = in->dims()[1];
 
-      for (size_t n = 0; n < rois_num; ++n) {
-        size_t roi_batch_idx = rois_data[0];
-        T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx;
+      for (int n = 0; n < rois_num; ++n) {
+        int roi_batch_idx = rois_data[0];
+        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
         for (int c = 0; c < channels; ++c) {
           for (int ph = 0; ph < pooled_height; ++ph) {
             for (int pw = 0; pw < pooled_width; ++pw) {
-              size_t pool_index = ph * pooled_width + pw;
-
+              int pool_index = ph * pooled_width + pw;
               if (argmax_data[pool_index] >= 0) {
-                size_t index = static_cast<size_t>(argmax_data[pool_index]);
+                auto index = argmax_data[pool_index];
                 batch_grad_data[index] += out_grad_data[pool_index];
               }
             }
           }
-          batch_grad_data += channel_offset;
-          out_grad_data += pool_channel_offset;
-          argmax_data += pool_channel_offset;
+          batch_grad_data += in_stride[1];
+          out_grad_data += out_stride[1];
+          argmax_data += argmax_stride[1];
         }
-        rois_data += roi_offset;
+        rois_data += roi_stride[0];
       }
     }
   }
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index 56909fb65f44ad00314103e21bee9535fbd59317..d4921cb80c8d78c52ae1887c36819b52621470eb 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase {
                    "SaveOp only support LoDTensor, %s has wrong type", iname);
 
     auto &tensor = var->Get<framework::LoDTensor>();
-
-    {  // the 1st field, uint32_t version
-      constexpr uint32_t version = 0;
-      fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
-    }
-    {  // the 2nd field, tensor description
-       // int32_t  size
-       // void*    protobuf message
-      framework::TensorDesc desc;
-      desc.set_data_type(framework::ToDataType(tensor.type()));
-      auto dims = framework::vectorize(tensor.dims());
-      auto *pb_dims = desc.mutable_dims();
-      pb_dims->Resize(static_cast<int>(dims.size()), 0);
-      std::copy(dims.begin(), dims.end(), pb_dims->begin());
-      int32_t size = desc.ByteSize();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-      auto out = desc.SerializeAsString();
-      fout.write(out.data(), size);
-    }
-    {  // the 3rd field, tensor data
-      uint64_t size = tensor.memory_size();
-      auto *data_ptr = tensor.data<void>();
-      PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                     "Index overflow when writing tensor");
-      if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-        constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-        std::unique_ptr<char[]> buf(new char[kBufSize]);
-        auto &gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-        platform::CPUPlace cpu;
-        uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-        while (size != 0) {
-          size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-          memory::Copy(cpu, buf.get(),
-                       boost::get<platform::GPUPlace>(tensor.place()),
-                       reinterpret_cast<const void *>(data), size_to_write,
-                       gpu_dev_ctx.stream());
-          gpu_dev_ctx.Wait();
-          fout.write(buf.get(), size_to_write);
-          data += size_to_write;
-          size -= size_to_write;
-        }
-#else
-        PADDLE_THROW("Unexpected branch");
-#endif
-      } else {
-        fout.write(static_cast<const char *>(data_ptr),
-                   static_cast<std::streamsize>(size));
-      }
-    }
-    {  // the 4th field, lod information
-       // uint64_t lod_level
-       // uint64_t lod_level_1 size in byte.
-       // int*     lod_level_1 data
-       // ...
-      auto lod = tensor.lod();
-      uint64_t size = lod.size();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-      for (auto &each : lod) {
-        size = each.size() * sizeof(framework::LoD::value_type::value_type);
-        fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-        fout.write(reinterpret_cast<const char *>(each.data()),
-                   static_cast<std::streamsize>(size));
-      }
-    }
+    framework::SerializeToStream(fout, tensor, dev_ctx);
   }
 };
 
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 5745580504fb9bda551f21665bff5c65ae82aeb9..e5c10fec4d840c58a74758a65ddfa93421ab4827 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
                   ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, double>);
+                       ops::ScaleKernel<paddle::platform::CPUPlace, double>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
index 820fd4e6855bb192ec3292ea6983d5ecae73b6e6..0d707751598e65bc56bf73a435c10b4acd6d8ed0 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -16,4 +16,6 @@
 
 REGISTER_OP_GPU_KERNEL(
     scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3059847f2d420359b347e3a5d514d8a3829a4e2
--- /dev/null
+++ b/paddle/operators/send_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(typhoonzero): this is a simple implementation which only sends
+// one tensor.
+class SendOp : public framework::OperatorBase {
+ public:
+  SendOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    // Create the RPC client for the "endpoint" attribute at construction.
+    if (!client_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      client_.reset(new detail::RPCClient(
+          grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
+      // TODO(typhoonzero): how to call InitVariables
+    }
+  }
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto iname = Input("X");
+    auto oname = Output("Out");
+    // TODO(typhoonzero): currently it's non-blocking,
+    // should block until server responds.
+    bool ret = client_->SendVariable(scope, iname, oname);
+    if (!ret) {  // a failed send is only logged here
+      LOG(ERROR) << "send variable error";
+    }
+  }
+
+ protected:
+  std::shared_ptr<detail::RPCClient> client_{nullptr};
+};
+
+class SendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor to be sent");
+    AddOutput("Out", "(Tensor) Output fetched from server");
+    AddComment(R"DOC(
+Send operator
+
+This operator will send tensor to recv_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164) "
+                         "Endpoint of the server to send to.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac03eb3752e7cd31dd80f4caa39dc0625f0409d5
--- /dev/null
+++ b/paddle/operators/send_recv_op_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+// TODO(typhoonzero): add python bindings for this test as
+// a RemoteOptimizer.
+
+#include <unistd.h>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(send);
+USE_NO_KERNEL_OP(recv);
+USE_OP(sum);
+
+// global for simplicity.
+std::unique_ptr<paddle::framework::OperatorBase> recv_op;
+
+void InitTensorsInScope(paddle::framework::Scope &scope,
+                        paddle::platform::CPUPlace &place) {
+  paddle::platform::CPUDeviceContext ctx(place);
+  auto var = scope.Var("X");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({10, 10});
+  float *expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(i);  // X[i] = i
+  }
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  out_tensor->mutable_data<float>(place);  // allocate
+}
+
+void AddOp(const std::string &type,
+           const paddle::framework::VariableNameMap &inputs,
+           const paddle::framework::VariableNameMap &outputs,
+           paddle::framework::AttributeMap attrs,
+           paddle::framework::BlockDescBind *block) {
+  // declare every output variable in the block as FP32
+  for (const auto &kv : outputs) {
+    for (const auto &v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(paddle::framework::DataType::FP32);
+    }
+  }
+
+  // append the op and wire up its inputs, outputs and attributes
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+void StartServerNet() {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  // Sub-program run inside recv_op; for a simple test we use sum.
+  paddle::framework::ProgramDescBind program;
+  paddle::framework::BlockDescBind *block = program.MutableBlock(0);
+  // X for server side tensors, RX for received tensors, must be of same shape.
+  AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"OptimizeBlock", block});
+  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
+                                                    {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  recv_op->Run(scope, ctx);
+}
+
+TEST(SendRecvOp, CPU) {
+  std::thread server_thread(StartServerNet);
+  sleep(5);  // wait for the server to start
+  // local (client side) net
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+
+  auto send_op = paddle::framework::OpRegistry::CreateOp(
+      "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  send_op->Run(scope, ctx);
+
+  auto in_var = scope.Var("X");
+  auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
+  float *expected = tensor->data<float>();
+
+  auto out_var = scope.Var("Out");
+  auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
+  // An output without any memory means the send failed.
+  EXPECT_NE(target->memory_size(), size_t(0));
+  float *actual = target->data<float>();
+  for (int64_t i = 0; i < target->numel(); ++i) {
+    EXPECT_EQ(expected[i] * 2, actual[i]);  // server computes sum(X, RX)
+  }
+  recv_op.reset();  // dtor can shut down and join the server thread.
+  server_thread.join();
+}
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
old mode 100755
new mode 100644
index cbe0b4233160dd1f3ebdf6db8b5f6df392efdfe7..255683a572c0e8d54791cb0c905d85239920d992
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -45,7 +45,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
     // Initialize the output's dims to maximum,
     // and re-set to real dims by the value of Offset and Length at kernel
     ctx->SetOutputDim("Out", input_dims);
-    }
+  }
 
  protected:
   framework::OpKernelType GetKernelType(
@@ -93,8 +93,7 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor), "
              "a vector<int> to describe the length of every input sequence for "
              "sub sequence item.");
-    AddOutput("Out",
-              "(LoDTensor), the output of SequenceSliceOp.");
+    AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
     AddComment(R"DOC(
 Sequence slice operator
 
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 72f4e4d5cbcd692423fa2a3e9ec8e7033b552c3c..5576d7b8be060a3c58cb18ed667041562cf853b8 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -55,7 +55,7 @@ SGD operator
 
 This operator implements one step of the stochastic gradient descent algorithm.
 
-$$param_out = param - learning_rate * grad$$
+$$param\_out = param - learning\_rate * grad$$
 
 )DOC");
   }
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index 48597c1d2ace9cb5fe36ba237f70cab8b280a836..c380e606869fd2c559c7d5f378857ca74fa8d8d3 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -57,11 +57,21 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
                               framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "");
-    AddInput("RankTable", "");
-    AddInput("I", "");
-    AddOutput("Out", "");
-    AddComment("");
+    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
+    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
+    AddInput("I",
+             "(LoDTensor) The step index. The RNN step memory 'X' will be "
+             "shrinked to match the size of the input of the index'th step.");
+    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
+    AddComment(
+        R"DOC(
+        In dynamic RNN, we are able to handle sequences of different lengths. 
+        Because of the multiple lengths, the size of each step input can be 
+        different, which may lead to a mismatching between the input of
+        the current step and the memory generated by the previous one. This 
+        operator shrinks memory according to the size of the next step input, 
+        to make sure that they can match each other.
+        )DOC");
   }
 };
 
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index ebf7b43700a7498aa18b5f648b0b8c2c4e7b442b..50543fcc148698c42e15259ba20bdacdd50ac1af 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
 
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims);
     PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The tensor rank of X must be at least 2.");
+                      "The tensor rank of Input(X) should not be less than 2.");
     if (ctx->HasInput("InsideWeight")) {
       PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
                      "If weights are provided, must specify both "
                      "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims,
-                        "The shape of InsideWeight must be same as X.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
-                        "The shape of OutsideWeight must be same as X.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
     }
 
     ctx->SetOutputDim("Diff", x_dims);
@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "The input tensor of smooth l1 loss op."
-             "The rank should be greater or equal to 2 with shape "
-             "[batch_size, value_dim1, value_dim2, ..., value_dimN]");
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The input value of smooth l1 loss op with shape "
+             "[batch_size, dim1, ..., dimN].");
     AddInput("Y",
-             "The target tensor of smooth l1 loss op "
-             "with the same shape as X.");
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The target value of smooth l1 loss op with same shape as X.");
     AddInput("InsideWeight",
-             "Optional input tensor of smooth l1 loss op with the same shape "
-             "as X. If provided, the result of (X - Y) will be multiplied "
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the result of (X - Y) will be multiplied "
              "by this tensor element by element.")
         .AsDispensable();
     AddInput("OutsideWeight",
-             "Optinal input of smooth l1 loss op with the same shape as X."
-             "If provided, the output smooth l1 loss will be multiplied by "
-             "this tensor element by element.")
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the out smooth l1 loss will be multiplied by this "
+             "tensor element by element.")
         .AsDispensable();
-    AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
+    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
         .AsIntermediate();
-    AddOutput("Out", "Smooth l1 loss.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank be 2. "
+              "The output smooth l1 loss with shape [batch_size, 1].");
     AddAttr<AttrType>("sigma",
                       "Hyper parameter of smooth l1 loss op."
                       "A float scalar with default value 3.0.")
@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Smooth L1 Loss Operator.
 
-This operator computes the smooth l1 loss for input and target.
-The operator takes the first dimension of input as the batch size.
+This operator computes the smooth l1 loss for X and Y.
+The operator takes the first dimension of X and Y as batch size.
 For each instance, it computes the smooth l1 loss element by element first
-and then sums all the losses. So the resulting output shape
-is [batch_size, 1].
+and then sums all the losses. So the shape of Out is [batch_size, 1].
 
 The equation is:
-loss = $$0.5 * (\sigma * (x-y))^2$$   if $$|x - y| < 1 /({\sigma}^2)$$
-       $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise
+$$
+Out_{\sigma}(X, Y)_i = \begin{cases}
+0.5 * (\sigma * (X_i - Y_i)) ^ 2
+\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
+\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
+\quad otherwise
+\end{cases}
+$$
+
+In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
+element of Out, X and Y.
 
 )DOC");
   }
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89c48e071cf351f7d7b9cf26a5d4989af291da57
--- /dev/null
+++ b/paddle/operators/unpool_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+namespace paddle {
+namespace operators {
+
+// Declares the inputs, output, attributes and docs of the unpool operator.
+class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Unpool2dOpMaker(framework::OpProto* proto,
+                  framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of unpool operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput(
+        "Indices",
+        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of unpool operator. "
+              "The format of output tensor is also NCHW. "
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<std::vector<int>>("ksize",
+                              "(vector), the unpooling window size(height, "
+                              "width) of unpooling operator.");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector, default:{1, 1}), "
+                              "strides (height, width) of unpooling operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector, default:{0, 0}), "
+                              "paddings (height, width) of unpooling operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::string>(
+        "unpooling_type",
+        "(string), unpooling type, can be \"max\" for max-unpooling ")
+        .InEnum({"max"});
+    AddComment(R"DOC(
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(N, C_{out}, H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + ksize[0] \\
+            W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+          $$
+        Paper:
+        http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
+        )DOC");
+  }
+};
+
+// Size of one unpooled axis: (in - 1) * stride - 2 * padding + ksize.
+int OutputSize(int input_size, int ksize, int padding, int stride) {
+  return (input_size - 1) * stride - 2 * padding + ksize;
+}
+
+// Forward unpool operator: validates inputs and infers the NCHW Out shape.
+class UnpoolOp : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnpoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) of UnpoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnpoolOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    auto in_y_dims = ctx->GetInputDim("Indices");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Unpooling input must be 4-dimensional.");
+    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+    PADDLE_ENFORCE(
+        ksize.size() == 2 && strides.size() == 2 && paddings.size() == 2,
+        "ksize, strides and paddings of UnpoolOp must have 2 elements.");
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+// Backward unpool operator: gives X@GRAD the same shape as the forward X.
+class UnpoolOpGrad : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
+            ops::UnpoolOpGrad);
+REGISTER_OP_CPU_KERNEL(unpool,  // CPU kernels of unpool: float and double
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(  // CPU kernels of unpool_grad: float and double
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18aafb7dc74ed474ed3ec5e8a388ecdb71b9a8f5
--- /dev/null
+++ b/paddle/operators/unpool_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_GPU_KERNEL(unpool,  // GPU kernels of unpool: float and double
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(  // GPU kernels of unpool_grad: float and double
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..243eb7e532c5149db4fb1b381fd8664ae4bdd81a
--- /dev/null
+++ b/paddle/operators/unpool_op.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+
+namespace paddle {
+namespace operators {
+// Max-unpooling forward kernel: zero-fills Out, then hands X, Indices and
+// Out to math::Unpool2dMaxFunctor.
+template <typename Place, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    // The unpooling attributes are deliberately not read here: the functor
+    // call below takes only the tensors.
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    if (output_data) {
+      math::SetConstant<Place, T> set_zero;
+      set_zero(context.device_context(), out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
+    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
+  }
+};
+// Max-unpooling backward kernel: zero-fills X@GRAD, then lets
+// math::Unpool2dMaxGradFunctor compute it from X, Indices, Out and Out@GRAD.
+template <typename Place, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    // X@GRAD is the only output, so without it there is nothing to compute;
+    // returning early also keeps a null pointer from reaching the functor,
+    // which receives in_x_grad as its destination.
+    if (in_x_grad == nullptr) return;
+
+    auto& device_ctx = context.device_context();
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    math::SetConstant<Place, T> zero;
+    zero(device_ctx, in_x_grad, static_cast<T>(0));
+    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
+    unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad,
+                          in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6311cb23d695c3cd851bcca120c24cced7fdd62
--- /dev/null
+++ b/paddle/platform/cuda_profiler.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda_profiler_api.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace paddle {
+namespace platform {
+
+// Writes `config_flags` (one per line) to a fresh temporary file and passes it
+// to cudaProfilerInitialize with `output_file`; output_mode is "kvp" or "csv".
+inline void CudaProfilerInit(std::string output_file, std::string output_mode,
+                             std::vector<std::string> config_flags) {
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
+  std::array<char, 128> buf;
+  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
+  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
+  // Copy the terminating NUL too: mktemp() needs a C string and buf starts
+  // out uninitialised. NOTE(review): mktemp() is race-prone; prefer mkstemp().
+  memcpy(buf.data(), tmpl.c_str(), tmpl.size() + 1);
+  auto result = mktemp(buf.data());
+  PADDLE_ENFORCE(strlen(result) != 0);
+  std::string config_file = result;
+  {  // scope closes the config file before the profiler is initialised
+    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
+    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
+    for (const auto& line : config_flags) ofs << line << std::endl;
+  }
+  PADDLE_ENFORCE(
+      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
+}
+
+// Start / stop CUDA profiling; inline because this header defines them.
+inline void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+inline void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
index d3e4cb567d71b987724366b6a0896f5df0eb6055..761d9edd87f428ba140d29a566fc3401199bab15 100644
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_R7
+CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index b2d69da93bcd4a5c8e694a18ca648ddc4bd947af..61caac545014db2a09e2ada0b508419578c49740 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
+#if CUDNN_VERSION >= 7001
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
+  __macro(cudnnSetConvolutionGroupCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index f55a1edce31ccf2498dcfcf0b30ba1012d7a7d1a..c16d3e0cbe01f90a5aa9a5d7a523cd4e282e4771 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/cuda_profiler.h"
 #include "paddle/platform/gpu_info.h"
 #endif
 
@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
+
+  m.def("nvprof_init", platform::CudaProfilerInit);
+  m.def("nvprof_start", platform::CudaProfilerStart);
+  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
 
   return m.ptr();
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index fda2a2f1b764106a7a108e8c56bc90ce3459e9b5..502637c881208e53dd832a9759b3873ef1988395 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -16,11 +16,13 @@ function cmake_gen() {
         echo "using python abi: $1"
         if [ "$1" == "cp27-cp27m" ]; then
             export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
         elif [ "$1" == "cp27-cp27mu" ]; then
             export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
@@ -181,6 +183,7 @@ EOF
     ${DOCKERFILE_GPU_ENV}
     ADD go/cmd/pserver/pserver /usr/bin/
     ADD go/cmd/master/master /usr/bin/
+    ADD paddle/pybind/print_operators_doc /usr/bin/
     # default command shows the paddle version and exit
     CMD ["paddle", "version"]
 EOF
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 28d82343ed32273740d0c52d0451681e43b3675e..7d54f0254c8ea9367a34233602293db5b8593f9a 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn
 
 # check websites for broken links
-linkchecker doc/en/html/index.html
-linkchecker doc/cn/html/index.html
+# Disabled for now: linkchecker currently fails on the generated docs.
+#linkchecker doc/en/html/index.html
+#linkchecker doc/cn/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index e2f5592248fd0b6166c2d11af02cef7815673def..2fcdbbc8bd671f8ae911cf82c7a91091f252a82f 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -544,6 +544,9 @@ message LayerConfig {
   // for batch normalization layer
   // The small constant added to the variance to improve numeric stability.
   optional double epsilon = 60 [ default = 0.00001 ];
+
+  // for factorization machine layer
+  optional uint32 factor_size = 61;
 }
 
 message EvaluatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5ba0e50c6ba0f84a3ea87d5a5199fef23a5b05ea..5b173694dd0e4a52c0179f12f5edd74e2c41cb8c 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase):
         image_conf.img_size_y = input_layer.height
         image_conf.channels = input_layer.size / (input_layer.width *
                                                   input_layer.height)
+        # only support for 4-dims inputs and NCHW order
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
 
 
 @config_layer('batch_norm')
@@ -2798,19 +2806,18 @@ class AddToLayer(LayerBase):
             name, self.layer_type, 0, inputs=inputs, **xargs)
         config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
 
-        if len(self.inputs) > 1:
-            for input_index in xrange(len(self.inputs)):
-                assert self.get_input_layer(0).height == self.get_input_layer(
-                    input_index).height
-                assert self.get_input_layer(0).width == self.get_input_layer(
-                    input_index).width
-                assert self.get_input_layer(0).depth == self.get_input_layer(
-                    input_index).depth
+        layer_size = self.get_input_layer(0).size
+        # Keep the height/width/depth of the last input that defines all three.
+        layer_with_hwc = self.get_input_layer(0)
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            assert layer_size == input_layer.size
+            if input_layer.height and input_layer.width and input_layer.depth:
+                layer_with_hwc = input_layer
 
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                        self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
+        self.set_layer_size(layer_with_hwc.size)
+        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
+        self.set_layer_depth(layer_with_hwc.depth)
         self.create_bias_parameter(bias, self.config.size)
 
 
@@ -3850,6 +3857,26 @@ class SwitchOrderLayer(LayerBase):
             name, 'switch_order', 0, inputs=inputs, **xargs)
         self.config.reshape_conf.height_axis.extend(reshape['height'])
         self.config.reshape_conf.width_axis.extend(reshape['width'])
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            in_h = input_layer.height
+            in_w = input_layer.width
+            out_dims = None
+            if input_layer.has_depth():
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
+                out_dims = [0, in_d, in_h, in_w, in_c]
+            else:
+                in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
+                out_dims = [0, in_h, in_w, in_c]
+            # reshape['width'][0] is always greater than 0, so out_dims[0]
+            # (the batch-size placeholder) is never part of the product.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
 
 
 @config_layer('scale_sub_region')
@@ -3871,6 +3898,21 @@ class ScaleSubRegionLayer(LayerBase):
                            image_conf.channels)
 
 
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    # Size-1 layer; its single input gets an [input size, factor_size] parameter.
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        in_size = self.get_input_layer(0).size
+        self.create_input_parameter(0, in_size * factor_size,
+                                    [in_size, factor_size])
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 8e127c9489ca5a4ed190e6d4e12ec4c9b28ad9cf..f6dc58b9c0ed0b14ad9db098892af14274aed0c1 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -148,6 +148,7 @@ __all__ = [
     'resize_layer',
     'sub_seq_layer',
     'scale_sub_region_layer',
+    'factorization_machine',
 ]
 
 
@@ -264,6 +265,8 @@ class LayerType(object):
 
     SCALE_SUB_REGION_LAYER = 'scale_sub_region'
 
+    FACTORIZATION_MACHINE = 'factorization_machine'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -1900,9 +1903,12 @@ def repeat_layer(input,
     A layer for repeating the input for num_repeats times.
 
     If as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+
     If not as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
 
@@ -1915,19 +1921,19 @@ def repeat_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_repeats: Repeat the input so many times
+    :param num_repeats: The times of repeating the input.
     :type num_repeats: int
     :param name: The name of this layer. It is optional.
-    :param as_row_vector: True for treating input as row vector and repeating
-                          in the column direction.  This is equivalent to apply
-                          concat_layer() with num_repeats same input.
-                          False for treating input as column vector and repeating
-                          in the row direction.
+    :type name: basestring
+    :param as_row_vector: Whether to treat the input as row vectors or not. If
+                          the parameter is set to True, the repeating operation
+                          will be performed in the column direction. Otherwise,
+                          it will be performed in the row direction.
     :type as_row_vector: bool
     :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1974,13 +1980,14 @@ def seq_reshape_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param reshape_size: the size of reshaped sequence.
+    :param reshape_size: The dimension of the reshaped sequence.
     :type reshape_size: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
@@ -2008,7 +2015,7 @@ def seq_reshape_layer(input,
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
     """
-    This layer is for linear interpolation with two inputs,
+    This layer performs linear interpolation on two inputs,
     which is used in NEURAL TURING MACHINE.
 
     .. math::
@@ -2030,7 +2037,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2064,7 +2072,7 @@ def bilinear_interp_layer(input,
                           name=None,
                           layer_attr=None):
     """
-    This layer is to implement bilinear interpolation on conv layer output.
+    This layer implements bilinear interpolation on convolutional layer's output.
 
     Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
 
@@ -2074,18 +2082,19 @@ def bilinear_interp_layer(input,
 
        bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
 
-    :param   input:        A input layer.
-    :type    input:        LayerOutput.
-    :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int | None
-    :param   out_size_y:   bilinear interpolation output height.
-    :type    out_size_y:   int | None
-    :param   name:         The layer's name, which cna not be specified.
-    :type    name:         None | basestring
-    :param   layer_attr:   Extra Layer attribute.
-    :type    layer_attr:   ExtraLayerAttribute
+    :param input: The input of this layer.
+    :type input: LayerOutput.
+    :param out_size_x: The width of the output.
+    :type out_size_x: int
+    :param out_size_y: The height of the output.
+    :type out_size_y: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype:  LayerOutput
+    :rtype: LayerOutput
     """
     assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
@@ -2120,8 +2129,8 @@ def power_layer(input, weight, name=None, layer_attr=None):
     .. math::
        y = x^w
 
-    where :math:`x` is a input vector, :math:`w` is scalar weight,
-    and :math:`y` is a output vector.
+    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
+    and :math:`y` is an output vector.
 
     The example usage is:
 
@@ -2131,11 +2140,12 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The exponent of the power.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2175,11 +2185,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The weight of each sample.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2217,7 +2228,8 @@ def trans_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2253,11 +2265,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param height: The height of the sample matrix
+    :param height: The height of the sample matrix.
     :type height: int
+    :param width: The width of the sample matrix.
+    :type width: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2302,15 +2317,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input layer a
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: scale for cosine value. default is 5.
+    :param scale: The scale of the cosine similarity. 1 is the default value.
     :type scale: float
-    :param size: layer size. NOTE size_a * size should equal size_b.
+    :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
     :type size: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2395,8 +2410,10 @@ def hsigmoid(input,
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
-    This idea is from "F. Morin, Y. Bengio (AISTATS 05):
-    Hierarchical Probabilistic Neural Network Language Model."
+
+    Reference:
+        `Hierarchical Probabilistic Neural Network Language Model
+        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
 
     The example usage is:
 
@@ -2407,19 +2424,21 @@ def hsigmoid(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :param label: Label layer.
+    :param label: The input label.
     :type label: LayerOutput
-    :param num_classes: number of classes.
-    :type num_classes: int | None
+    :param num_classes: The number of classes. And it should be larger than 2. If the parameter
+                        is not set or set to None, its actual value will be automatically set to
+                        the number of labels.
+    :type num_classes: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: Parameter Attribute. None means default parameter.
-    :type param_attr: ParameterAttribute | None
-    :param layer_attr: Extra Layer Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2969,8 +2988,8 @@ def spp_layer(input,
     A layer performs spatial pyramid pooling.
 
     Reference:
-        Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
-        https://arxiv.org/abs/1406.4729
+        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+        <https://arxiv.org/abs/1406.4729>`_
 
     The example usage is:
 
@@ -3071,8 +3090,8 @@ def img_cmrnorm_layer(input,
     Response normalization across feature maps.
 
     Reference:
-        ImageNet Classification with Deep Convolutional Neural Networks
-        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf
+        `ImageNet Classification with Deep Convolutional Neural Networks
+        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
 
     The example usage is:
 
@@ -3138,9 +3157,9 @@ def batch_norm_layer(input,
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
     Reference:
-        Batch Normalization: Accelerating Deep Network Training by Reducing
+        `Batch Normalization: Accelerating Deep Network Training by Reducing
         Internal Covariate Shift
-        http://arxiv.org/abs/1502.03167
+        <http://arxiv.org/abs/1502.03167>`_
 
     The example usage is:
 
@@ -4241,7 +4260,7 @@ def dot_prod_layer(input1, input2, name=None, layer_attr=None):
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input1: The first input layer.
-    :type input: LayerOutput
+    :type input1: LayerOutput
     :param input2: The second input layer.
     :type input2: LayerOutput
     :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
@@ -5397,10 +5416,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     to be devided by groups.
 
     Reference:
-        Maxout Networks
-        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-        Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        https://arxiv.org/pdf/1312.6082v4.pdf
+        `Maxout Networks
+        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
+        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
 
     .. math::
        y_{si+j} = \max_k x_{gsi + sk + j}
@@ -5465,9 +5484,9 @@ def ctc_layer(input,
     alignment between the inputs and the target labels is unknown.
 
     Reference:
-        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
         Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
@@ -5539,9 +5558,9 @@ def warp_ctc_layer(input,
     install it to :code:`third_party/install/warpctc` directory.
 
     Reference:
-        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
         - Let num_classes represents the category number. Considering the 'blank'
@@ -5761,8 +5780,8 @@ def nce_layer(input,
     Noise-contrastive estimation.
 
     Reference:
-        A fast and simple algorithm for training neural probabilistic language
-        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
+        `A fast and simple algorithm for training neural probabilistic language
+        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
 
     The example usage is:
 
@@ -5877,8 +5896,8 @@ def rank_cost(left,
     A cost Layer for learning to rank using gradient descent.
 
     Reference:
-        Learning to Rank using Gradient Descent
-        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf
+        `Learning to Rank using Gradient Descent
+        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
 
     .. math::
 
@@ -6413,8 +6432,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
         smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
 
     Reference:
-        Fast R-CNN
-        https://arxiv.org/pdf/1504.08083v2.pdf
+        `Fast R-CNN
+        <https://arxiv.org/pdf/1504.08083v2.pdf>`_
 
     The example usage is:
 
@@ -6620,8 +6639,8 @@ def prelu_layer(input,
     The Parametric Relu activation that actives outputs with a learnable weight.
 
     Reference:
-        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
 
     .. math::
        z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6717,8 +6736,8 @@ def gated_unit_layer(input,
     product between :match:`X'` and :math:`\sigma` is finally returned.
 
     Reference:
-        Language Modeling with Gated Convolutional Networks
-        https://arxiv.org/abs/1612.08083
+        `Language Modeling with Gated Convolutional Networks
+        <https://arxiv.org/abs/1612.08083>`_
 
     .. math::
        y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
@@ -6854,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
 
     :param input: The input of this layer. If two inputs are given, the second one
                   will be regarded as the reference.
+                  The input must be 4-dimensional and in NCHW order.
     :type input: LayerOutput | Sequence
     :param offset: The crop offset.
     :type offset: Sequence
@@ -7387,3 +7407,73 @@ def scale_sub_region_layer(input, indices, value, name=None):
         parents=[input, indices],
         num_filters=input.num_filters,
         size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse.
+
+    This implementation only considers the 2-order feature interactions using
+    Factorization Machine with the formula:
+
+    .. math::
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of V
+        is the latent vector corresponding to each input dimension. The size of
+        each latent vector is k.
+
+    For details, please refer to the paper: Factorization machines.
+
+    .. code-block:: python
+
+        first_order = paddle.layer.fc(input=input,
+                                      size=1,
+                                      act=paddle.activation.Linear())
+        second_order = paddle.layer.factorization_machine(input=input,
+                                                          factor_size=10)
+        fm = paddle.layer.addto(input=[first_order, second_order],
+                                act=paddle.activation.Linear(),
+                                bias_attr=False)
+
+    :param input: The input layer. Supported input types: all input data types
+                  on CPU, and only dense input types on GPU.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vector size.
+    :type factor_size: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        factor_size=factor_size,
+        type=LayerType.FACTORIZATION_MACHINE,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index a21f67a2d99e7eab39708e2a571d30d7e9f20ce6..10c941f707498ec45e79bed9d3f8054eea19887d 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
 test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
-test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer)
+test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
+test_factorization_machine)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..4f3002b19942ed58970bfd64e5978c1601273992
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 1024
+  active_type: ""
+}
+layers {
+  name: "__factorization_machine_0__"
+  type: "factorization_machine"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___factorization_machine_0__.w0"
+  }
+  factor_size: 10
+}
+parameters {
+  name: "___factorization_machine_0__.w0"
+  size: 10240
+  initial_mean: 0.0
+  initial_std: 0.03125
+  dims: 1024
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__factorization_machine_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__factorization_machine_0__"
+  input_layer_names: "data"
+  output_layer_names: "__factorization_machine_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b249de0fee3c8ca4ad0520872fa2497c493d31b5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=1024)
+
+fm = factorization_machine(input=data, factor_size=10)
+
+outputs(fm)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 33a0829ba8d635ebd68b50f3da07da958fb79dcb..70f61e84997efdbe3d6f268d249be8bac15b9ecd 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count):
         '''Get the number of physical cores'''
         if platform.system() == "Linux":
             num_sockets = int(
-                os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs")
+                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
                 .read())
             num_cores_per_socket = int(
-                os.popen(
-                    "lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs")
+                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
                 .read())
             return num_sockets * num_cores_per_socket
         else:
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 98b97c75ca72f11c105535e0f2a5fa0201db5d42..f10bf7e42a1ead09b3eba0d61e55701215e4360f 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -38,6 +38,7 @@ UCI_TEST_DATA = None
 URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
 MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
 
+
 def feature_range(maximums, minimums):
     import matplotlib
     matplotlib.use('Agg')
@@ -114,7 +115,8 @@ def test():
 
 
 def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL)
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                                 MD5_MODEL)
     with open(tar_file, 'r') as f:
         parameters = Parameters.from_tar(f)
     return parameters
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 9677c9568c6783921545364bca7b2c9c0041d823..dd25bc19ec5f4fd6eb3e04f304b1de488e988f41 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -13,13 +13,14 @@ import nets
 import optimizer
 import backward
 import regularizer
+from param_attr import ParamAttr
 
 from core import LoDTensor, CPUPlace, GPUPlace
 
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
     'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor'
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
 ]
 
 
@@ -35,7 +36,8 @@ def __read_gflags_from_env__():
     read_env_flags = ['use_pinned_memory']
     if core.is_compile_gpu():
         read_env_flags.append('fraction_of_gpu_memory_to_use')
-    core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)])
+    core.init_gflags([sys.argv[0]] +
+                     ["--tryfromenv=" + ",".join(read_env_flags)])
 
 
 __read_gflags_from_env__()
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
index bd4a6fda1fd20e68d5a42e76f6ab516bb5c00cff..137c5736226b689340748d5098ca51659d5acff8 100644
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -26,9 +26,9 @@ class Evaluator(object):
         name(str): The name of evaluator. such as, "accuracy". Used for generate 
             temporary variable name.
         main_program(Program, optional): The evaluator should be added to this 
-            main_program. Default g_main_program 
+            main_program. Default default_main_program()
         startup_program(Program, optional):The parameter should be added to this 
-            startup_program. Default g_startup_program
+            startup_program. Default default_startup_program()
             
     Attributes:
         states(list): The list of state variables. states will be reset to zero 
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 3e26d1b983a3c924ce2392c266bcd32e27c7b309..bdc82eede9d93a7cf904999a6b869ce2d23c90dc 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,6 +1,6 @@
 import numpy as np
 from . import core
-from framework import Program, g_main_program
+from framework import Program, default_main_program
 
 __all__ = ['Executor', 'g_scope']
 
@@ -103,7 +103,7 @@ class Executor(object):
             fetch_list = []
 
         if program is None:
-            program = g_main_program
+            program = default_main_program()
 
         if not isinstance(program, Program):
             raise TypeError()
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 9a62698b86b8fb38384f8c7d76ac14d3a0c95cac..49c6d8983457fa9c29451b8d020dd0c581481f9c 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -3,10 +3,12 @@ import collections
 import numpy as np
 from . import core
 import proto.framework_pb2 as framework_pb2
+import contextlib
 
 __all__ = [
     'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
-    'default_main_program', 'g_startup_program', 'g_main_program'
+    'default_main_program', 'program_guard', 'switch_startup_program',
+    'switch_main_program'
 ]
 
 
@@ -395,7 +397,11 @@ class Block(object):
         return v
 
     def all_parameters(self):
-        return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+        return list(self.iter_parameters())
+
+    def iter_parameters(self):
+        return (item[1] for item in self.vars.iteritems()
+                if isinstance(item[1], Parameter))
 
     def create_var(self, *args, **kwargs):
         var = Variable(self, *args, **kwargs)
@@ -469,6 +475,37 @@ class Block(object):
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other block
+        Args:
+            other(Block): other block 
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Block):
+            raise TypeError("copy_param_info_from should be invoked with Block")
+        for p in other.iter_parameters():
+            assert isinstance(p, Parameter)
+            v = self.vars.get(p.name, None)
+            if v is None:
+                raise ValueError("copy_param_info_from should be invoked with "
+                                 "same topology")
+            assert isinstance(v, Variable)
+            new_p = Parameter(
+                block=self,
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=p.stop_gradient,
+                trainable=p.trainable,
+                optimize_attr=p.optimize_attr,
+                regularizer=p.regularizer,
+                name=v.name)
+            self.vars[new_p.name] = new_p
+
 
 class Program(object):
     def __init__(self):
@@ -489,6 +526,7 @@ class Program(object):
         p.desc = core.ProgramDesc(self.desc)
         p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
         p.sync_with_cpp()
+        p.copy_param_info_from(self)
         return p
 
     def prune(self, targets):
@@ -572,6 +610,24 @@ class Program(object):
         for block in self.blocks:
             block.sync_with_cpp()
 
+    def copy_param_info_from(self, other):
+        """
+        Copy parameter information from the global block of `other`.
+
+        `other` must be a Program (else TypeError) with the same number of
+        blocks as this program (else ValueError).
+
+        Args:
+            other(Program): The program to copy parameter information from.
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "programs, which represent the same topology")
+        self.global_block().copy_param_info_from(other.global_block())
+
     def list_vars(self):
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
@@ -600,13 +656,88 @@ class Parameter(Variable):
 
 
 # program is a global instance.
-g_main_program = Program()
-g_startup_program = Program()
+_main_program_ = Program()
+_startup_program_ = Program()
 
 
 def default_startup_program():
-    return g_startup_program
+    """
+    Get default startup program. In startup program, Paddle will initialize
+    parameters, initialize nccl handle, etc.
+    
+    Returns:
+        Program: startup program
+    """
+    return _startup_program_
 
 
 def default_main_program():
-    return g_main_program
+    """
+    Get default main program. The main program is used for training or testing.
+    
+    Returns:
+        Program: main program
+    """
+    return _main_program_
+
+
+def switch_main_program(program):
+    """
+    Switch the main program to a new program.
+    
+    Args:
+        program(Program): The new main program
+
+    Returns:
+        Program: The previous main program
+    """
+    global _main_program_
+    prev_program = _main_program_
+    _main_program_ = program
+    return prev_program
+
+
+def switch_startup_program(program):
+    """
+    Switch the startup program to a new program 
+    Args:
+        program(Program): The new startup program
+
+    Returns:
+        Program: The previous startup program
+    """
+    global _startup_program_
+    prev_program = _startup_program_
+    _startup_program_ = program
+    return prev_program
+
+
+@contextlib.contextmanager
+def program_guard(main_program, startup_program=None):
+    """
+    Switch default programs inside a `with` block; always restore on exit.
+
+    Examples:
+        >>> with program_guard(Program()):
+        >>>   data = fluid.layers.data(...)
+        >>>   hidden = fluid.layers.fc(...)
+
+    Args:
+        main_program(Program): New main program inside `with` statement
+        startup_program(Program): New startup program inside `with` statement.
+            None means do not change startup program.
+    """
+    # Validate first so a bad argument never leaves a half-switched state.
+    if not isinstance(main_program, Program):
+        raise TypeError("main_program should be Program")
+    if not (startup_program is None or isinstance(startup_program, Program)):
+        raise TypeError("startup_program should be Program")
+    main_program = switch_main_program(main_program)
+    if startup_program is not None:
+        startup_program = switch_startup_program(startup_program)
+    try:
+        yield
+    finally:
+        switch_main_program(main_program)
+        if startup_program is not None:
+            switch_startup_program(startup_program)
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index e5b2aa3b919df4cec1091c0bbd39b7e400cc6867..e147ac22ad289eb00c83def66974d875fcdc31f8 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -1,8 +1,7 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
-    Variable
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
 
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     if vars is None:
         if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
         if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     :param executor: executor that save variable
     :param dirname: directory path
     :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default g_program.
+    program which fit `predicate`. Default default_main_program().
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be loaded.
     :param vars: variables need to be loaded. If specify vars, program &
@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     if vars is None:
         if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
         if not isinstance(main_program, Program):
             raise TypeError("program's type should be Program")
 
@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
 
 def get_inference_program(target_vars, main_program=None):
     if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
@@ -177,12 +176,12 @@ def save_inference_model(dirname,
     :param target_vars: Variables from which we can get inference results.
     :param executor: executor that save inference model
     :param main_program: original program, which will be pruned to build the inference model.
-    Default g_main_program.
+            Default default_main_program().
 
     :return: None
     """
     if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
     :param executor: executor for retrieving the value
     :param name: the name of the parameter
     :param program: the program where the variable is found
-    Default g_main_program.
+            Default default_main_program().
     :return: the LoDTensor for the variable
     """
     if program is None:
-        program = g_main_program
+        program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 5f8855551114a9a9b671d1630c9e8a3f0cb5c04b..5b384e5cf5df5e5abc7f0ef81ff11cd8a31cfa2d 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -1,9 +1,10 @@
 import copy
 import itertools
 
-from framework import Variable, g_main_program, \
-    g_startup_program, unique_name, dtype_is_floating
+from framework import Variable, default_main_program, default_startup_program, \
+    unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
+from param_attr import ParamAttr
 
 
 class LayerHelper(object):
@@ -22,7 +23,7 @@ class LayerHelper(object):
     def main_program(self):
         prog = self.kwargs.get('main_program', None)
         if prog is None:
-            return g_main_program
+            return default_main_program()
         else:
             return prog
 
@@ -30,7 +31,7 @@ class LayerHelper(object):
     def startup_program(self):
         prog = self.kwargs.get('startup_program', None)
         if prog is None:
-            return g_startup_program
+            return default_startup_program()
         else:
             return prog
 
@@ -60,31 +61,15 @@ class LayerHelper(object):
 
     @property
     def param_attr(self):
-        default = {'name': None}
-        actual = self.kwargs.get('param_attr', None)
-        if actual is None:
-            actual = default
-        for default_field in default.keys():
-            if default_field not in actual:
-                actual[default_field] = default[default_field]
-        return actual
+        return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
 
     @property
     def bias_attr(self):
-        default = {'name': None}
-        bias_attr = self.kwargs.get('bias_attr', None)
-        if bias_attr is None:
-            bias_attr = default
-
-        if isinstance(bias_attr, dict):
-            for default_field in default.keys():
-                if default_field not in bias_attr:
-                    bias_attr[default_field] = default[default_field]
-        return bias_attr
+        return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
 
     def multiple_param_attr(self, length):
         param_attr = self.param_attr
-        if isinstance(param_attr, dict):
+        if isinstance(param_attr, ParamAttr):
             param_attr = [param_attr]
 
         if len(param_attr) != 1 and len(param_attr) != length:
@@ -112,23 +97,30 @@ class LayerHelper(object):
                 raise ValueError("Data Type mismatch")
         return dtype
 
-    def create_parameter(self, attr, shape, dtype, suffix='w',
-                         initializer=None):
+    def create_parameter(self,
+                         attr,
+                         shape,
+                         dtype,
+                         is_bias=False,
+                         default_initializer=None):
         # Deepcopy the attr so that parameters can be shared in program
-        attr_copy = copy.deepcopy(attr)
-        if initializer is not None:
-            attr_copy['initializer'] = initializer
+        assert isinstance(attr, ParamAttr)
+        attr = copy.deepcopy(attr)  # never mutate the caller's ParamAttr
+        suffix = 'b' if is_bias else 'w'
+        if default_initializer is None:
+            if is_bias:
+                attr.set_default_bias_initializer()
+            else:
+                attr.set_default_param_initializer()
         else:
-            attr_copy['initializer'] = self._get_default_initializer(dtype)
-        if attr_copy['name'] is None:
-            attr_copy['name'] = unique_name(".".join([self.name, suffix]))
+            attr.set_default_initializer(default_initializer)
+        if attr.name is None:
+            attr.name = unique_name(".".join([self.name, suffix]))
+
         self.startup_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr_copy)
+            dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
         return self.main_program.global_block().create_parameter(
-            name=attr_copy['name'],
-            dtype=dtype,
-            shape=shape,
-            trainable=attr_copy.get('trainable', True))
+            dtype=dtype, shape=shape, **attr.to_kwargs())
 
     def create_tmp_variable(self, dtype):
         return self.main_program.current_block().create_var(
@@ -153,11 +145,7 @@ class LayerHelper(object):
             persistable=True,
             initializer=initializer)
 
-    def append_bias_op(self,
-                       input_var,
-                       bias_initializer,
-                       dim_start=1,
-                       dim_end=None):
+    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
         """
         Append bias operator and return its output. If the user does not set
         bias_attr, append_bias_op will return input_var
@@ -177,11 +165,7 @@ class LayerHelper(object):
             return input_var
 
         b = self.create_parameter(
-            attr=bias_attr,
-            shape=size,
-            dtype=input_var.dtype,
-            suffix='b',
-            initializer=bias_initializer)
+            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
         tmp = self.create_tmp_variable(dtype=input_var.dtype)
         self.append_op(
             type='elementwise_add',
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
index 28bc3d214b559a089efb2bb736eb49cb1ba4de25..9dcc11d21618ec12ac6a2112ed8e307ab028f6c0 100644
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -1,10 +1,11 @@
-from . import core
+import core
 import proto.framework_pb2 as framework_pb2
 from framework import OpProtoHolder, Variable, Program, Operator
-from initializer import Constant, Normal, Xavier
+from initializer import Constant, Normal, Xavier, Initializer
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO
+from param_attr import ParamAttr
 
 __all__ = [
     'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
@@ -17,9 +18,7 @@ def fc(input,
        size,
        num_flatten_dims=1,
        param_attr=None,
-       param_initializer=None,
        bias_attr=None,
-       bias_initializer=None,
        act=None,
        name=None,
        main_program=None,
@@ -54,23 +53,10 @@ def fc(input,
     to the LayerHelper constructor.
 
     """
-
-    def _get_default_param_initializer():
-        return Xavier()
-
-    def _get_default_bias_initializer():
-        return Constant()
-
     helper = LayerHelper('fc', **locals())
 
     dtype = helper.input_dtype()
 
-    if param_initializer is None:
-        param_initializer = _get_default_param_initializer()
-
-    if bias_initializer is None:
-        bias_initializer = _get_default_bias_initializer()
-
     mul_results = []
     for input_var, param_attr in helper.iter_inputs_and_params():
         input_shape = input_var.shape
@@ -78,10 +64,7 @@ def fc(input,
             reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
         ] + [size]
         w = helper.create_parameter(
-            attr=param_attr,
-            initializer=param_initializer,
-            shape=param_shape,
-            dtype=dtype)
+            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
         tmp = helper.create_tmp_variable(dtype)
         helper.append_op(
             type="mul",
@@ -102,7 +85,7 @@ def fc(input,
         helper.append_op(
             type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
     # add bias
-    pre_activation = helper.append_bias_op(pre_bias, bias_initializer)
+    pre_activation = helper.append_bias_op(pre_bias)
     # add activation
     return helper.append_activation(pre_activation)
 
@@ -110,7 +93,6 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
-              param_initializer=None,
               param_attr=None,
               dtype='float32',
               main_program=None,
@@ -119,6 +101,7 @@ def embedding(input,
     Embedding Layer.
 
     Args:
+       param_initializer:
        input: The input to the function
        size: The size of the layer
        is_sparse: A flag that decleares whether the input is sparse
@@ -136,15 +119,9 @@ def embedding(input,
 
     """
 
-    def _get_default_param_initializer():
-        return Xavier()
-
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=size,
-        dtype=dtype,
-        initializer=param_initializer or _get_default_param_initializer())
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
     tmp = helper.create_tmp_variable(dtype)
     helper.append_op(
         type='lookup_table',
@@ -176,7 +153,7 @@ def dynamic_lstm(input,
     if not use_peepholes:
         bias_size[1] = 4 * size
     bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b')
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
 
     hidden = helper.create_tmp_variable(dtype)
     cell = helper.create_tmp_variable(dtype)
@@ -471,19 +448,14 @@ def sums(input, out=None, main_program=None, startup_program=None):
 def linear_chain_crf(input,
                      label,
                      param_attr=None,
-                     param_initializer=None,
                      main_program=None,
                      startup_program=None):
-    def _get_default_param_initializer():
-        return Xavier()
-
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
     transition = helper.create_parameter(
         attr=helper.param_attr,
         shape=[size + 2, size],
-        dtype=helper.input_dtype(),
-        initializer=param_initializer or _get_default_param_initializer())
+        dtype=helper.input_dtype())
     alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
     emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
     transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -646,9 +618,7 @@ def sequence_conv(input,
                   filter_stride=1,
                   padding=None,
                   bias_attr=None,
-                  bias_initializer=None,
                   param_attr=None,
-                  param_initializer=None,
                   act=None,
                   main_program=None,
                   startup_program=None):
@@ -658,30 +628,15 @@ def sequence_conv(input,
     in the input parameters to the function.
     """
 
-    def _get_default_bias_initializer():
-        return Constant()
-
-    def _get_default_param_initializer():
-        return Xavier()
-
     # FIXME(dzh) : want to unify the argument of python layer
     # function. So we ignore some unecessary attributes.
     # such as, padding_trainable, context_start.
 
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
-
-    if param_initializer is None:
-        param_initializer = _get_default_param_initializer()
-    if bias_initializer is None:
-        bias_initializer = _get_default_bias_initializer()
-
     filter_shape = [filter_size * input.shape[1], num_filters]
     filter = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        initializer=param_initializer)
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
@@ -696,7 +651,7 @@ def sequence_conv(input,
             'contextStart': -int(filter_size / 2),
             'contextLength': filter_size
         })
-    pre_act = helper.append_bias_op(pre_bias, bias_initializer)
+    pre_act = helper.append_bias_op(pre_bias)
     return helper.append_activation(pre_act)
 
 
@@ -707,9 +662,7 @@ def conv2d(input,
            padding=None,
            groups=None,
            param_attr=None,
-           param_initializer=None,
            bias_attr=None,
-           bias_initializer=None,
            act=None,
            name=None,
            main_program=None,
@@ -722,13 +675,6 @@ def conv2d(input,
     conv-2d output, if mentioned in the input parameters.
     """
 
-    def _get_default_bias_initializer():
-        return Constant()
-
-    def _get_default_param_initializer(filter_size, num_channels):
-        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
-        return Normal(0.0, std, 0)
-
     helper = LayerHelper('conv2d', **locals())
     dtype = helper.input_dtype()
 
@@ -750,17 +696,16 @@ def conv2d(input,
     input_shape = input.shape
     filter_shape = [num_filters, num_filter_channels] + filter_size
 
-    if param_initializer is None:
-        param_initializer = _get_default_param_initializer(filter_size,
-                                                           num_channels)
-    if bias_initializer is None:
-        bias_initializer = _get_default_bias_initializer()
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return Normal(0.0, std, 0)
 
     filter = helper.create_parameter(
         attr=helper.param_attr,
         shape=filter_shape,
         dtype=dtype,
-        initializer=param_initializer)
+        default_initializer=_get_default_param_initializer())
+
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
@@ -774,8 +719,7 @@ def conv2d(input,
                'paddings': padding,
                'groups': groups})
 
-    pre_act = helper.append_bias_op(
-        pre_bias, bias_initializer, dim_start=1, dim_end=2)
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
 
     return helper.append_activation(pre_act)
 
@@ -876,12 +820,10 @@ def batch_norm(input,
         attr=helper.param_attr,
         shape=param_shape,
         dtype=dtype,
-        initializer=Constant(1.0))
+        default_initializer=Constant(1.0))
+
     bias = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        initializer=Constant(0.0))
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
 
     mean = helper.create_global_variable(
         dtype=input.dtype, shape=param_shape, persistable=True)
@@ -1356,7 +1298,7 @@ def lod_rank_table(x, level=0, main_program=None):
 
 def max_sequence_len(rank_table, main_program=None):
     """
-    This function creates an operator to calculate the length of 
+    This function creates an operator to calculate the length of
     max seqence through input rank_table(should be a lod_rank_table)
     """
     helper = LayerHelper("max_seqence_len", **locals())
@@ -1587,6 +1529,93 @@ def array_length(array, main_program=None):
     return tmp
 
 
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer.
+
+    This layer is also known as deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.  None if use output size to
+            calculate filter_size
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    op_attr = dict()
+
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+
+    if isinstance(stride, int):
+        op_attr['strides'] = [stride, stride]  # expand scalar like paddings
+    elif stride is not None:
+        op_attr['strides'] = stride
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
+        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
+        filter_size = [filter_size_h, filter_size_w]
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+
+    # list() lets a tuple filter_size be concatenated with the channel dims.
+    filter_shape = [input_channel, num_filters] + list(filter_size)
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+
+    return out
+
+
 class ConditionalBlockGuard(BlockGuard):
     def __init__(self, block):
         if not isinstance(block, ConditionalBlock):
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
new file mode 100644
index 0000000000000000000000000000000000000000..86088fdd7ce17b8b7a9688dc838e69b2aa754013
--- /dev/null
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -0,0 +1,112 @@
+from initializer import Initializer, Xavier, Constant
+from regularizer import WeightDecayRegularizer
+
+
+class ParamAttr(object):
+    """
+    Attributes a layer uses when it creates a parameter.
+
+    Args:
+        name(str|None): Name of the parameter.
+        initializer(Initializer|None): Initializer of the parameter. When it
+            is None, set_default_initializer may fill it in later.
+        learning_rate(float): Stored as-is and exported by to_kwargs.
+        regularizer: Stored as-is and exported by to_kwargs.
+        trainable(bool): Stored as-is and exported by to_kwargs.
+    """
+
+    def __init__(self,
+                 name=None,
+                 initializer=None,
+                 learning_rate=1.0,
+                 regularizer=None,
+                 trainable=True):
+        self.name = name
+        self.initializer = initializer
+        self.learning_rate = learning_rate
+        self.regularizer = regularizer
+        self.trainable = trainable
+
+    def set_default_initializer(self, initializer):
+        """
+        Use `initializer` only if no initializer has been set yet.
+
+        Args:
+            initializer(Initializer|None): The fallback initializer.
+
+        Raises:
+            ValueError: If both `initializer` and self.initializer are None.
+        """
+        if initializer is None:
+            if self.initializer is None:
+                raise ValueError("ParamAttr.initializer is not set")
+            return
+
+        if self.initializer is not None:
+            return
+
+        self.initializer = initializer
+
+    def set_default_param_initializer(self):
+        """Fall back to Xavier() when no initializer has been set."""
+        self.set_default_initializer(Xavier())
+
+    def set_default_bias_initializer(self):
+        """Fall back to Constant(0.0) when no initializer has been set."""
+        self.set_default_initializer(Constant(0.0))
+
+    @staticmethod
+    def to_attr(arg):
+        """
+        Convert the shorthand forms accepted by layers into ParamAttr.
+
+        Args:
+            arg: None, a ParamAttr, a name (str/unicode), an Initializer, a
+                WeightDecayRegularizer, a bool, or a list/tuple of these.
+
+        Returns:
+            A ParamAttr; a list of converted items for a list/tuple; or
+            False when `arg` is False.
+
+        Raises:
+            TypeError: If `arg` has any other type.
+        """
+        if arg is None:
+            return ParamAttr()
+        elif isinstance(arg, (list, tuple)):
+            # One attribute per input; LayerHelper.multiple_param_attr
+            # takes len() of the result.
+            return [ParamAttr.to_attr(a) for a in arg]
+        elif isinstance(arg, ParamAttr):
+            return arg
+        elif isinstance(arg, str) or isinstance(arg, unicode):
+            return ParamAttr(name=arg)
+        elif isinstance(arg, Initializer):
+            return ParamAttr(initializer=arg)
+        elif isinstance(arg, WeightDecayRegularizer):
+            return ParamAttr(regularizer=arg)
+        elif isinstance(arg, bool):
+            return ParamAttr.to_attr(None) if arg else False
+        else:
+            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
+
+    def to_kwargs(self, with_initializer=False):
+        """
+        Export the attributes as a keyword-argument dict.
+
+        Args:
+            with_initializer(bool): Also include the 'initializer' entry.
+
+        Returns:
+            dict: 'name', 'learning_rate', 'regularizer', 'trainable' and,
+            if requested, 'initializer'.
+        """
+        kwargs = {
+            'name': self.name,
+            'learning_rate': self.learning_rate,
+            'regularizer': self.regularizer,
+            'trainable': self.trainable
+        }
+        if with_initializer:
+            kwargs['initializer'] = self.initializer
+        return kwargs
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2069b713faf41c5c00ceaf47e030864b98c678da
--- /dev/null
+++ b/python/paddle/v2/fluid/profiler.py
@@ -0,0 +1,46 @@
+import paddle.v2.fluid.core as core
+from contextlib import contextmanager
+
+__all__ = ['cuda_profiler']
+# Default nvprof options, used when cuda_profiler() is called with config=None.
+NVPROF_CONFIG = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+@contextmanager
+def cuda_profiler(output_file, output_mode=None, config=None):
+    """The CUDA profiler.
+    This function is used to profile CUDA program by CUDA runtime application
+    programming interface. The profiling result will be written into
+    `output_file` with Key-Value pair format or Comma separated values format.
+    The user can set the output mode by `output_mode` argument and set the
+    counters/options for profiling by `config` argument. The default config
+    is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
+    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+
+    Args:
+        output_file (string) : The output file name, the result will be
+            written into this file.
+        output_mode (string) : The output mode has Key-Value pair format and
+            Comma separated values format. It should be 'kvp' or 'csv'.
+        config (list of string) : The profiler options and counters can refer
+            to "Compute Command Line Profiler User Guide".
+    """
+    if output_mode is None:
+        output_mode = 'csv'
+    if output_mode not in ['kvp', 'csv']:
+        raise ValueError("The output mode must be 'kvp' or 'csv'.")
+    config = NVPROF_CONFIG if config is None else config
+    core.nvprof_init(output_file, output_mode, config)
+    core.nvprof_start()  # Enables collection by the active profiling tool.
+    try:
+        yield
+    finally:  # Disables profiler collection even if the profiled code raised.
+        core.nvprof_stop()
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
index 4d7664469e481344cf9eea84688f068b4fb99dee..a35abe3e0c436be4eaed01c9b9183344c6d3b275 100644
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -1,5 +1,11 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_image_classification_train)
+py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
+py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+
+# Register each remaining test_*.py script as its own py_test target.
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index 690c53397198889ac6005aaacbfa9d6e02b7da3d..0f0cc5b5406ef51ac3504a95ea716056ae8730af 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,7 +1,9 @@
 from __future__ import print_function
+
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import sys
 
 
 def resnet_cifar10(input, depth=32):
@@ -67,8 +69,7 @@ def vgg16_bn_drop(input):
 
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
     fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
-    bn = fluid.layers.batch_norm(input=reshape1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
     return fc2
@@ -80,11 +81,18 @@ data_shape = [3, 32, 32]
 images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
 label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-# Add neural network config
-# option 1. resnet
-# net = resnet_cifar10(images, 32)
-# option 2. vgg
-net = vgg16_bn_drop(images)
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
 
 predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
 cost = fluid.layers.cross_entropy(input=predict, label=label)
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index 93987a2b80dc9ca304a708d4799bc38b448a68c4..bcd6f4d6bc66fd01406332bd1d6d7a5c4b0ddb5a 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -44,7 +44,7 @@ def db_lstm():
         size=[pred_len, word_dim],
         dtype='float32',
         is_sparse=IS_SPARSE,
-        param_attr={'name': 'vemb'})
+        param_attr='vemb')
 
     mark_embedding = fluid.layers.embedding(
         input=mark,
@@ -57,8 +57,8 @@ def db_lstm():
         fluid.layers.embedding(
             size=[word_dict_len, word_dim],
             input=x,
-            param_attr={'name': embedding_name,
-                        'trainable': False}) for x in word_input
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
     ]
     emb_layers.append(predicate_embedding)
     emb_layers.append(mark_embedding)
@@ -125,8 +125,8 @@ def main():
     crf_cost = fluid.layers.linear_chain_crf(
         input=feature_out,
         label=target,
-        param_attr={"name": 'crfw',
-                    "learning_rate": mix_hidden_lr})
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=mix_hidden_lr))
     avg_cost = fluid.layers.mean(x=crf_cost)
     # TODO(qiao)
     #   1. add crf_decode_layer and evaluator
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index c96d186ffe8d9313cb818a55d68dfc3c13db19cc..fa18965aac667c0829b9e6ee56ece585564f9060 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid
 BATCH_SIZE = 128
 image = fluid.layers.data(name='x', shape=[784], dtype='float32')
 
-param_attr = {
-    'name': None,
-    'regularization': fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
-}
+regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
 
 hidden1 = fluid.layers.fc(input=image,
                           size=128,
                           act='relu',
-                          param_attr=param_attr)
+                          param_attr=regularizer)
 hidden2 = fluid.layers.fc(input=hidden1,
                           size=64,
                           act='relu',
-                          param_attr=param_attr)
+                          param_attr=regularizer)
 
 predict = fluid.layers.fc(input=hidden2,
                           size=10,
                           act='softmax',
-                          param_attr=param_attr)
+                          param_attr=regularizer)
 
 label = fluid.layers.data(name='y', shape=[1], dtype='int64')
 
@@ -35,6 +32,13 @@ opts = optimizer.minimize(avg_cost)
 
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
 
+inference_program = fluid.default_main_program().clone()
+test_accuracy = fluid.evaluator.Accuracy(
+    input=predict, label=label, main_program=inference_program)
+test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+inference_program = fluid.io.get_inference_program(
+    test_target, main_program=inference_program)
+
 train_reader = paddle.batch(
     paddle.reader.shuffle(
         paddle.dataset.mnist.train(), buf_size=8192),
@@ -69,11 +73,6 @@ for pass_id in range(PASS_NUM):
         acc = np.array(outs[1])
         pass_acc = accuracy.eval(exe)
 
-        test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-        test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
-
         test_accuracy.reset(exe)
         for data in test_reader():
             x_data = np.array(map(lambda x: x[0], data)).astype("float32")
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index f8dc1518579d5a9d7a8d0498dcc5fd8a6d1692c4..db91ca4f9c7d17fb51fc5d65a0464e976d98523c 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -24,7 +24,7 @@ def get_usr_combined_features():
         input=uid,
         dtype='float32',
         size=[USR_DICT_SIZE, 32],
-        param_attr={'name': 'user_table'},
+        param_attr='user_table',
         is_sparse=IS_SPARSE)
 
     usr_fc = layers.fc(input=usr_emb, size=32)
@@ -36,7 +36,7 @@ def get_usr_combined_features():
     usr_gender_emb = layers.embedding(
         input=usr_gender_id,
         size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr={'name': 'gender_table'},
+        param_attr='gender_table',
         is_sparse=IS_SPARSE)
 
     usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
@@ -48,7 +48,7 @@ def get_usr_combined_features():
         input=usr_age_id,
         size=[USR_AGE_DICT_SIZE, 16],
         is_sparse=IS_SPARSE,
-        param_attr={'name': 'age_table'})
+        param_attr='age_table')
 
     usr_age_fc = layers.fc(input=usr_age_emb, size=16)
 
@@ -58,7 +58,7 @@ def get_usr_combined_features():
     usr_job_emb = layers.embedding(
         input=usr_job_id,
         size=[USR_JOB_DICT_SIZE, 16],
-        param_attr={'name': 'job_table'},
+        param_attr='job_table',
         is_sparse=IS_SPARSE)
 
     usr_job_fc = layers.fc(input=usr_job_emb, size=16)
@@ -81,7 +81,7 @@ def get_mov_combined_features():
         input=mov_id,
         dtype='float32',
         size=[MOV_DICT_SIZE, 32],
-        param_attr={'name': 'movie_table'},
+        param_attr='movie_table',
         is_sparse=IS_SPARSE)
 
     mov_fc = layers.fc(input=mov_emb, size=32)
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index b0cd1a518cd1be60474df126470573a5a5b81b70..92d3629d42613e896e93e0149928b50940058169 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding(
     size=[dict_size, EMBED_SIZE],
     dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 embed_second = fluid.layers.embedding(
     input=second_word,
     size=[dict_size, EMBED_SIZE],
     dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 embed_third = fluid.layers.embedding(
     input=third_word,
     size=[dict_size, EMBED_SIZE],
     dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 embed_forth = fluid.layers.embedding(
     input=forth_word,
     size=[dict_size, EMBED_SIZE],
     dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 
 concat_embed = fluid.layers.concat(
     input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
index b7790b01062d480cbd6c9e1a626d318385b4f61e..f6120aedecf1015c279b8f218f5e37f2e598ab91 100644
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
 
 
@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase):
 
         append_backward_ops(total_sum_scaled)
 
-        g_vars = map(g_main_program.global_block().var,
+        g_vars = map(default_main_program().global_block().var,
                      [each_x.name + "@GRAD" for each_x in x])
         g_out = [
             item.sum()
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index 71f9599e0de83c86808f7e62547f80d3d50ffc7d..e766a68c0e338b07e47260e40edc544c98555382 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
 
 
 def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+
     if data_format == "NCHW":
         n, c, h, w = x.shape
         x_square = x * x
@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
+        if len(x_shape) == 2:
+            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         mean = x_sum / element_count
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
-        return (normalized * scale + offset), mean, var
+        y = normalized * scale + offset
+        if len(x_shape) == 2:
+            y = np.reshape(y, x_shape)
+        return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    x_shape = x.shape
+
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
+
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
         grad_y = np.transpose(grad_y, (0, 2, 3, 1))
@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
         grad_x = np.transpose(grad_x, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
         grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+
+    if len(x_shape) == 2:
+        grad_x = np.reshape(grad_x, x_shape)
     return grad_x, grad_scale, grad_offset
 
 
@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
         momentum = 0.9
 
         # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 2
+        n, h, w, c = 2, 3, 4, 5
         x_shape = [n, h, w, c]
         scale_shape = [c]
 
@@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest):
         print 'python: NHWC, NCHW, backward checking passed'
 
     def test_forward_backward(self):
-        def test_with_place(place, tensor_format):
+        def test_with_place(place, tensor_format, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
 
-            # N, H, W, C: 12, 3, 4, 2
-            n, h, w, c = 2, 3, 4, 2
-
-            if data_format == "NHWC":
-                x_shape = [n, h, w, c]
-            elif data_format == "NCHW":
-                x_shape = [n, c, h, w]
+            if len(shape) == 2:
+                x_shape = shape
+                c = shape[1]
             else:
-                raise ValueError("Unknown data type.")
+                # shape is given as [n, h, w, c]
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+                if data_format == "NHWC":
+                    x_shape = [n, h, w, c]
+                elif data_format == "NCHW":
+                    x_shape = [n, c, h, w]
+                else:
+                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
             x_val = np.random.random_sample(x_shape).astype(np.float32)
@@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest):
             #  for gradient test
             # y_grad = np.ones(x_shape).astype(np.float32)
             y_grad = np.zeros(x_shape).astype(np.float32)
-            y_grad[0, 0, 0, 0] = 1.
+            if len(y_grad.shape) == 2:
+                y_grad[0, 0] = 1.
+            else:
+                y_grad[0, 0, 0, 0] = 1.
             # y_grad = np.random.random_sample(x_shape).astype(np.float32)
             x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
                 x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
             places.append(core.GPUPlace(0))
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
-                test_with_place(place, data_format)
+                test_with_place(place, data_format, [2, 3, 4, 5])
+                test_with_place(place, data_format, [2, 3])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
index d953ee7ddc37d150d87cbd680379410a4d16f6b1..2b9d8f351a2836cd723d629d4790de1e068d0ea3 100644
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -1,7 +1,7 @@
 import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
 import numpy
@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase):
 
         cpu = core.CPUPlace()
         exe = Executor(cpu)
-        exe.run(g_startup_program)
+        exe.run(default_startup_program())
 
         x = numpy.random.random(size=(10, 1)).astype('float32')
 
@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase):
         append_backward_ops(loss=loss)
         outs = exe.run(
             feed={'X': x},
-            fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0]
+            fetch_list=[
+                default_main_program().block(0).var(data.name + "@GRAD")
+            ])[0]
         print outs
 
 
diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
index 558273e30dff7fb74f78751f4fe569f79a453d0d..b1ef87c5cb1711c419b401c5950839816f7f4160 100644
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -1,9 +1,10 @@
 import unittest
-from paddle.v2.fluid.layers import mul, data, sequence_pool
+
+import numpy
 import paddle.v2.fluid.core as core
+
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
-import numpy
+from paddle.v2.fluid.layers import mul, data
 
 
 class TestExecutor(unittest.TestCase):
@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase):
         a_np = numpy.random.random((100, 784)).astype('float32')
         b_np = numpy.random.random((784, 100)).astype('float32')
         exe = Executor(place)
-        outs = exe.run(g_main_program,
-                       feed={'a': a_np,
-                             'b': b_np},
-                       fetch_list=[out])
+        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
         out = outs[0]
         self.assertEqual((100, 100), out.shape)
         self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
index 8e8e1b0a8c07a60cb1404462f976d10fe26e87f6..2fd609d4474e97ecd96adcd146f2f550e0772740 100644
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -1,6 +1,6 @@
 import unittest
 
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
 
@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
     def test_batch_norm_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program)
-        layers.batch_norm(
+        hidden1 = fluid.layers.batch_norm(
             input=images,
             main_program=main_program,
             startup_program=startup_program)
+        hidden2 = fluid.layers.fc(input=hidden1,
+                                  size=128,
+                                  act='relu',
+                                  main_program=main_program)
+        hidden3 = fluid.layers.batch_norm(
+            input=hidden2,
+            main_program=main_program,
+            startup_program=startup_program)
 
-        # print str(main_program)
+        print str(main_program)
 
     def test_dropout_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program)
-        layers.dropout(
+        fluid.layers.dropout(
             x=images,
             dropout_prob=0.5,
             main_program=main_program,
@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase):
         main_program = Program()
         startup_program = Program()
 
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
             dtype='float32',
@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
     def test_elementwise_add_with_act(self):
         main_program = Program()
         startup_program = Program()
-        image1 = layers.data(
+        image1 = fluid.layers.data(
             name='pixel1',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
-        image2 = layers.data(
+        image2 = fluid.layers.data(
             name='pixel2',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
-        out = layers.elementwise_add(
+        out = fluid.layers.elementwise_add(
             x=image1,
             y=image2,
             act='relu',
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 87dc6d1a6270e0f8425b56601d04049450c73380..33b0e54f42afc82beaa24e334023f30a4035f039 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -1,183 +1,141 @@
+from __future__ import print_function
 import unittest
 
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 
 
 class TestBook(unittest.TestCase):
     def test_fit_a_line(self):
         program = Program()
-        x = layers.data(
-            name='x', shape=[13], dtype='float32', main_program=program)
-        y_predict = layers.fc(input=x, size=1, act=None, main_program=program)
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+            program.append_backward(avg_cost)
 
-        y = layers.data(
-            name='y', shape=[1], dtype='float32', main_program=program)
-        cost = layers.square_error_cost(
-            input=y_predict, label=y, main_program=program)
-
-        avg_cost = layers.mean(x=cost, main_program=program)
-        self.assertIsNotNone(avg_cost)
-        program.append_backward(avg_cost)
-
-        print str(program)
+        print(str(program))
 
     def test_recognize_digits_mlp(self):
         program = Program()
-
-        # Change g_program, so the rest layers use `g_program`
-        images = layers.data(
-            name='pixel', shape=[784], dtype='float32', main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
-        hidden1 = layers.fc(input=images,
-                            size=128,
-                            act='relu',
-                            main_program=program)
-        hidden2 = layers.fc(input=hidden1,
-                            size=64,
-                            act='relu',
-                            main_program=program)
-        predict = layers.fc(input=hidden2,
-                            size=10,
-                            act='softmax',
-                            main_program=program)
-        cost = layers.cross_entropy(
-            input=predict, label=label, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
-        self.assertIsNotNone(avg_cost)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            # program_guard routes the layers below into `program`
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden1 = layers.fc(input=images, size=128, act='relu')
+            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
+            predict = layers.fc(input=hidden2, size=10, act='softmax')
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
 
     def test_simple_conv2d(self):
         program = Program()
-        images = layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='int32',
-            main_program=program)
-        layers.conv2d(
-            input=images,
-            num_filters=3,
-            filter_size=[4, 4],
-            main_program=program)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
+            layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
 
-    def test_recognize_digits_conv(self):
+        print(str(program))
+
+    def test_conv2d_transpose(self):
         program = Program()
+        with program_guard(program):
+            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
+            layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
+        print(str(program))
 
-        images = layers.data(
-            name='pixel',
-            shape=[1, 28, 28],
-            dtype='float32',
-            main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
-        conv_pool_1 = nets.simple_img_conv_pool(
-            input=images,
-            filter_size=5,
-            num_filters=2,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            main_program=program)
-        conv_pool_2 = nets.simple_img_conv_pool(
-            input=conv_pool_1,
-            filter_size=5,
-            num_filters=4,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            main_program=program)
-
-        predict = layers.fc(input=conv_pool_2,
-                            size=10,
-                            act="softmax",
-                            main_program=program)
-        cost = layers.cross_entropy(
-            input=predict, label=label, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
-
-        program.append_backward(avg_cost)
-
-        print str(program)
+    def test_recognize_digits_conv(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            conv_pool_1 = nets.simple_img_conv_pool(
+                input=images,
+                filter_size=5,
+                num_filters=2,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+            conv_pool_2 = nets.simple_img_conv_pool(
+                input=conv_pool_1,
+                filter_size=5,
+                num_filters=4,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+
+            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+
+            program.append_backward(avg_cost)
+
+        print(str(program))
 
     def test_word_embedding(self):
         program = Program()
-        dict_size = 10000
-        embed_size = 32
-        first_word = layers.data(
-            name='firstw', shape=[1], dtype='int64', main_program=program)
-        second_word = layers.data(
-            name='secondw', shape=[1], dtype='int64', main_program=program)
-        third_word = layers.data(
-            name='thirdw', shape=[1], dtype='int64', main_program=program)
-        forth_word = layers.data(
-            name='forthw', shape=[1], dtype='int64', main_program=program)
-        next_word = layers.data(
-            name='nextw', shape=[1], dtype='int64', main_program=program)
-
-        embed_first = layers.embedding(
-            input=first_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr={'name': 'shared_w'},
-            main_program=program)
-        embed_second = layers.embedding(
-            input=second_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr={'name': 'shared_w'},
-            main_program=program)
-
-        embed_third = layers.embedding(
-            input=third_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr={'name': 'shared_w'},
-            main_program=program)
-        embed_forth = layers.embedding(
-            input=forth_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr={'name': 'shared_w'},
-            main_program=program)
-
-        concat_embed = layers.concat(
-            input=[embed_first, embed_second, embed_third, embed_forth],
-            axis=1,
-            main_program=program)
-
-        hidden1 = layers.fc(input=concat_embed,
-                            size=256,
-                            act='sigmoid',
-                            main_program=program)
-        predict_word = layers.fc(input=hidden1,
-                                 size=dict_size,
-                                 act='softmax',
-                                 main_program=program)
-        cost = layers.cross_entropy(
-            input=predict_word, label=next_word, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
-        self.assertIsNotNone(avg_cost)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            dict_size = 10000
+            embed_size = 32
+            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
+            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
+            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
+            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
+            next_word = layers.data(name='nextw', shape=[1], dtype='int64')
+
+            embed_first = layers.embedding(
+                input=first_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+            embed_second = layers.embedding(
+                input=second_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+
+            embed_third = layers.embedding(
+                input=third_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+            embed_forth = layers.embedding(
+                input=forth_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+
+            concat_embed = layers.concat(
+                input=[embed_first, embed_second, embed_third, embed_forth],
+                axis=1)
+
+            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
+            predict_word = layers.fc(input=hidden1,
+                                     size=dict_size,
+                                     act='softmax')
+            cost = layers.cross_entropy(input=predict_word, label=next_word)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
 
     def test_linear_chain_crf(self):
         program = Program()
-
-        # Change g_program, so the rest layers use `g_program`
-        images = layers.data(
-            name='pixel', shape=[784], dtype='float32', main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
-        hidden = layers.fc(input=images, size=128, main_program=program)
-        crf = layers.linear_chain_crf(
-            input=hidden, label=label, main_program=program)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden = layers.fc(input=images, size=128)
+            crf = layers.linear_chain_crf(input=hidden, label=label)
+            self.assertNotEqual(crf, None)
+
+        print(str(program))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
index bbc11930b9e804c2769cc590c298c6e90dc36ca6..30d619fe318517345195281b17f88e9916b6afb3 100644
--- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -1,6 +1,5 @@
 from paddle.v2.fluid.layers import lod_rank_table, data
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
 import paddle.v2.fluid.core as core
 import numpy
 import unittest
@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
         tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
-        exe.run(g_main_program, scope=scope, feed={'x': tensor})
+        exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
         self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eeaa90758c57ef0d92a8ad7b0a4c1b1f2c38be3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLogLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'log_loss'
+        samples_num = 32
+
+        predicted = np.random.uniform(0.1, 1.0,
+                                      (samples_num, 1)).astype("float32")
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
+        epsilon = 1e-4
+        self.inputs = {
+            'Predicted': predicted,
+            'Labels': labels,
+        }
+
+        self.attrs = {'epsilon': epsilon}
+        loss = -labels * np.log(predicted + epsilon) - (
+            1 - labels) * np.log(1 - predicted + epsilon)
+        self.outputs = {'Loss': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py
index 05e42f315833cab5bc5272cbd2173ea8012ff7f5..5fbed43e254b811d38e441e946a73c24f87373de 100644
--- a/python/paddle/v2/fluid/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
@@ -30,9 +30,7 @@ class TestMaxOutOp(OpTest):
     def init_test_case(self):
         self.MaxOut_forward_naive = maxout_forward_naive
         self.shape = [100, 6, 2, 2]
-        self.groups=2
-
-
+        self.groups = 2
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
index e8362d2e9c6038c04c24dce35de8c53bfde78142..ce34d95ac8cb2644dee9c551cd8e85b33609919a 100644
--- a/python/paddle/v2/fluid/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -1,11 +1,15 @@
 import unittest
-from paddle.v2.fluid.framework import Variable, Program, g_main_program
+
 import paddle.v2.fluid.core as core
 
+from paddle.v2.fluid.framework import Program, default_main_program
+
+main_program = default_main_program()
+
 
 class TestOperator(unittest.TestCase):
     def test_error_type(self):
-        block = g_main_program.create_block()
+        block = main_program.create_block()
         try:
             block.append_op()
             self.assertFail()
diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
index 13f6278ad8b7244e7980b32463f29d7a824b4572..694344acbbd3b7c80cb0ff48ada843f794061282 100644
--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -1,17 +1,19 @@
 import unittest
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.io as io
 from paddle.v2.fluid.initializer import ConstantInitializer
 import numpy as np
 
+main_program = default_main_program()
+
 
 class TestParameter(unittest.TestCase):
     def test_param(self):
         shape = [784, 100]
         val = 1.0625
-        b = g_main_program.global_block()
+        b = main_program.global_block()
         param = b.create_parameter(
             name='fc.w',
             shape=shape,
@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase):
         self.assertEqual(core.DataType.FP32, param.dtype)
         self.assertEqual(0, param.block.idx)
         exe = Executor(core.CPUPlace())
-        p = exe.run(g_main_program, fetch_list=[param])[0]
+        p = exe.run(main_program, fetch_list=[param])[0]
         self.assertTrue(np.allclose(p, np.ones(shape) * val))
-        p = io.get_parameter_value_by_name('fc.w', exe, g_main_program)
+        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
         self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
 
 
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..395d0dc36a3d1d6fbfebb4cdf34395c4edee412d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -0,0 +1,35 @@
+import unittest
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import paddle.v2.fluid.layers as layers
+
+
+class TestProfiler(unittest.TestCase):
+    """Smoke test: run a small conv2d net under `profiler.cuda_profiler`."""
+
+    def test_nvprof(self):
+        """Execute a few conv2d iterations on GPU 0 inside the profiler."""
+        if not fluid.core.is_compile_gpu():
+            # Report a skip rather than a silent pass when the GPU check
+            # fails, so test output shows that nothing was exercised.
+            self.skipTest("fluid.core.is_compile_gpu() returned False")
+        num_iters = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        # The result is unused; the call is presumably needed only to add the
+        # conv2d layer to the default main program that is run below.
+        layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.GPUPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv'):
+            for _ in range(num_iters):
+                batch = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': batch})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
index e9bcefd21569aaa9225c676ea03b5c8e37d00333..1a9313c68aab165d85ae29051faeacb4927ac2c9 100644
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,35 +1,38 @@
+from __future__ import print_function
 import unittest
 
-from paddle.v2.fluid.framework import Program
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import Program, default_main_program
+import paddle.v2.fluid.layers as layers
+
+main_program = default_main_program()
 
 
 class TestProgram(unittest.TestCase):
     def test_program(self):
-        b = g_main_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(-1, b.parent_idx)
         self.assertEqual(0, b.idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(2, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_main_program.rollback()
+        main_program.rollback()
 
-        b = g_main_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(3, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_main_program.rollback()
-        b = g_main_program.current_block()
+        main_program.rollback()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
@@ -48,8 +51,8 @@ class TestProgram(unittest.TestCase):
 
         # FIXME(yuyang18): We manual compare the output string, since the order
         # of variable could be changed.
-        print prog
-        print prog.clone()
+        print(prog)
+        print(prog.clone())
 
     def test_parse_program_from_string(self):
         prog = Program()
@@ -67,8 +70,8 @@ class TestProgram(unittest.TestCase):
         binary_str = prog.desc.serialize_to_string()
         prog_restored = Program.parse_from_string(binary_str)
 
-        print prog
-        print prog_restored
+        print(prog)
+        print(prog_restored)
 
     def test_append_backward(self):
         prog = Program()
@@ -123,6 +126,20 @@ class TestProgram(unittest.TestCase):
             actual_ops.append(op.type)
         self.assertEqual(actual_ops, expect_ops)
 
+    def test_program_clone_with_parameter(self):
+        """A cloned program must still report parameters in block 0."""
+        prog, startup = Program(), Program()
+        programs = dict(main_program=prog, startup_program=startup)
+        # Build a data layer followed by two stacked fc layers.
+        x = layers.data(name='x', shape=[784], dtype='float32', **programs)
+        fc1 = layers.fc(input=x, size=100, **programs)
+        layers.fc(input=fc1, size=100, **programs)
+
+        cloned = prog.clone()
+        # The clone's first block must list at least one parameter.
+        cloned_params = cloned.blocks[0].all_parameters()
+        self.assertNotEqual(0, len(cloned_params))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
index 84548847f76c6315da000e1b3d062deafe55a05e..36e0c84c0b8e7d40aa56d75c8904a38694881be4 100644
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1):
 
             temp_l = layers.fc(input=x_t,
                                size=self.input_dim,
-                               param_attr={'name': 'W'},
+                               param_attr='W',
                                bias_attr=False,
                                **self.p_info)
             temp_r = layers.fc(input=h_pre,
                                size=self.input_dim,
-                               param_attr={'name': 'U'},
+                               param_attr='U',
                                bias_attr=False,
                                **self.p_info)
 
diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
index 7cedb930ca861aed95c355931d80cb4d265c8235..a28d9c7f82d3735c410369eb61e350168c267cea 100644
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -4,24 +4,22 @@ import math
 import sys
 from op_test import OpTest
 
+
 class TestROIPoolOp(OpTest):
     def set_data(self):
         self.init_test_case()
         self.make_rois()
         self.calc_roi_pool()
 
-        self.inputs = {
-            'X': self.x, 
-            'ROIs': self.rois}
-        
+        self.inputs = {'X': self.x, 'ROIs': self.rois}
+
         self.attrs = {
             'spatial_scale': self.spatial_scale,
             'pooled_height': self.pooled_height,
-            'pooled_width': self.pooled_width}
+            'pooled_width': self.pooled_width
+        }
 
-        self.outputs = {
-            'Out': self.outs,
-            'Argmax': self.argmaxes}
+        self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
 
     def init_test_case(self):
         self.batch_size = 5
@@ -30,10 +28,9 @@ class TestROIPoolOp(OpTest):
         self.width = 4
 
         # n, c, h, w
-        self.x_dim = (self.batch_size, self.channels,
-                      self.height, self.width)
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
 
-        self.spatial_scale = 1.0/4.0
+        self.spatial_scale = 1.0 / 4.0
         self.pooled_height = 2
         self.pooled_width = 2
         self.rois_num = 2
@@ -41,13 +38,11 @@ class TestROIPoolOp(OpTest):
         self.x = np.random.random(self.x_dim).astype('float32')
 
     def calc_roi_pool(self):
-        out_data = np.zeros(
-            (self.rois_num, self.channels,
-            self.pooled_height, self.pooled_width))
-        argmax_data = np.zeros(
-            (self.rois_num, self.channels,
-            self.pooled_height, self.pooled_width))
-            
+        out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
+                             self.pooled_width))
+        argmax_data = np.zeros((self.rois_num, self.channels,
+                                self.pooled_height, self.pooled_width))
+
         for i in range(self.rois_num):
             roi = self.rois[i]
             roi_batch_id = roi[0]
@@ -56,8 +51,8 @@ class TestROIPoolOp(OpTest):
             roi_end_w = int(round(roi[3] * self.spatial_scale))
             roi_end_h = int(round(roi[4] * self.spatial_scale))
 
-            roi_height = int(max(roi_end_h - roi_start_h + 1, 1));
-            roi_width = int(max(roi_end_w - roi_start_w + 1, 1));
+            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
+            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
 
             x_i = self.x[roi_batch_id]
 
@@ -84,7 +79,7 @@ class TestROIPoolOp(OpTest):
                             out_data[i, c, ph, pw] = -sys.float_info.max
 
                         argmax_data[i, c, ph, pw] = -1
-                        
+
                         for h in range(hstart, hend):
                             for w in range(wstart, wend):
                                 if x_i[c, h, w] > out_data[i, c, ph, pw]:
@@ -104,11 +99,11 @@ class TestROIPoolOp(OpTest):
             y1 = np.random.random_integers(
                 0, self.height / self.spatial_scale - self.pooled_height)
 
-            x2 = np.random.random_integers(
-                x1 + self.pooled_width, self.width / self.spatial_scale)
-            y2 = np.random.random_integers(
-                y1 + self.pooled_height, self.height / self.spatial_scale)
-            
+            x2 = np.random.random_integers(x1 + self.pooled_width,
+                                           self.width / self.spatial_scale)
+            y2 = np.random.random_integers(y1 + self.pooled_height,
+                                           self.height / self.spatial_scale)
+
             roi = [batch_ids[i], x1, y1, x2, y2]
             rois.append(roi)
         self.rois = np.array(rois).astype("int64")
@@ -123,5 +118,6 @@ class TestROIPoolOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 05f6a560644f18da6ff2e015911901cd73cc36c9..86db4c64b493d94cc675ed4bcee7e2925fef1977 100644
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
 
+main_program = default_main_program()
+
 
 class TestShrinkRNNMemory(unittest.TestCase):
     def test_shrink_rnn_memory(self):
@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
         append_backward_ops(loss=mem3_mean)
         x_grad = exe.run(
             feed={'x': tensor},
-            fetch_list=[g_main_program.global_block().var('x@GRAD')])[0]
+            fetch_list=[main_program.global_block().var('x@GRAD')])[0]
         self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
 
 
diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e87f283042c081ed9f232d140ff8c303cd3d1858
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
@@ -0,0 +1,110 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    """NumPy reference for 2-D max unpooling.
+
+    Each element of `input` is copied into a zero-filled output plane at the
+    position named by the matching entry of `indices`, read as a row-major
+    flattened offset into that plane.
+
+    Args:
+        input: 4-D array; its last two axes are treated as the pooled height
+            and width.
+        indices: array indexed like `input`, holding flattened offsets into
+            one (out_hsize, out_wsize) output plane.
+        ksize: pooling window size as (height, width).
+        strides: pooling strides as (height, width).
+        paddings: pooling paddings as (height, width).
+
+    Returns:
+        Array of shape (s0, s1, out_hsize, out_wsize), zero everywhere except
+        at the scattered positions.
+    """
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    # The output width must come from the input width (s3), not the height.
+    out_wsize = (s3 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in range(s0):
+        for cidx in range(s1):
+            for h in range(s2):
+                for w in range(s3):
+                    # Split the flattened offset into (row, column).
+                    hidx, widx = divmod(
+                        int(indices[nidx, cidx, h, w]), out_wsize)
+                    out[nidx, cidx, hidx, widx] = input[nidx, cidx, h, w]
+
+    return out
+
+
+class TestUnpoolOp(OpTest):
+    """Checks the "unpool" op against a NumPy max-pool/unpool reference."""
+
+    def setUp(self):
+        """Max-pool a random tensor by hand and register the op's I/O."""
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) // \
+                self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) // \
+                self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in range(hsize_out):
+            for j in range(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                        self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                        self.paddings[1], wsize))
+                # The window can be clipped at the borders, so the flattened
+                # argmax has to be decoded with the real window width.
+                win_w = c_end - c_start
+                for nidx in range(nsize):
+                    for cidx in range(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                                c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        # Flattened offset of the maximum in the full plane.
+                        indices[nidx, cidx, i, j] = \
+                                (r_start + arg // win_w) * wsize + \
+                                c_start + arg % win_w
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+                self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        """Default configuration: 3x3 max window, stride 2, no padding."""
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
index 92ffdceb6c84fb2669f8c1bb556c46fb1c03c411..f1e4c0ba21d5c4f10d2b5011bdb5abaebaec5431 100644
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
 import paddle.v2.fluid.core as core
 import numpy as np
 
@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError, lambda: convert("int8"))
 
     def test_var(self):
-        b = g_main_program.current_block()
+        b = default_main_program().current_block()
         w = b.create_var(
             dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
         self.assertNotEqual(str(w), "")